diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.h
new file mode 100644
index 0000000000000000000000000000000000000000..829a6424550b3f44d56f1b09f316d7abce3961c8
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.h
@@ -0,0 +1,99 @@
+/*******************************************************************************
+* Copyright 2020-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_GRAPH_SYCL_H
+#define ONEAPI_DNNL_DNNL_GRAPH_SYCL_H
+
+#include "oneapi/dnnl/dnnl_graph.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_graph_api
+/// @{
+
+/// @addtogroup dnnl_graph_api_interop
+/// @{
+
+/// @addtogroup dnnl_graph_api_sycl_interop
+/// @{
+
+/// Allocation call-back function interface for SYCL. A SYCL allocator should
+/// be used with the SYCL runtime, and a host allocator with a non-SYCL
+/// runtime. The call-back should return a USM device memory pointer.
+typedef void *(*dnnl_graph_sycl_allocate_f)(
+        size_t size, size_t alignment, const void *dev, const void *context);
+
+/// Deallocation call-back function interface for SYCL. A SYCL allocator
+/// should be used with the SYCL runtime, and a host allocator with a
+/// non-SYCL runtime. The call-back should deallocate the USM device memory
+/// returned by #dnnl_graph_sycl_allocate_f.
+typedef void (*dnnl_graph_sycl_deallocate_f)(
+        void *buf, const void *dev, const void *context, void *event);
+
+/// Creates an allocator with the given allocation and deallocation call-back
+/// function pointers.
+///
+/// @param allocator Output allocator.
+/// @param sycl_malloc A pointer to a SYCL malloc function.
+/// @param sycl_free A pointer to a SYCL free function.
+/// @returns #dnnl_success on success and a status describing the
+///     error otherwise.
+dnnl_status_t DNNL_API dnnl_graph_sycl_interop_allocator_create(
+        dnnl_graph_allocator_t *allocator,
+        dnnl_graph_sycl_allocate_f sycl_malloc,
+        dnnl_graph_sycl_deallocate_f sycl_free);
+
+/// Creates an engine associated with a SYCL device and a SYCL context with
+/// an allocator attached. This API supplements the existing oneDNN engine
+/// API.
+///
+/// @param engine Output engine.
+/// @param device Pointer to the SYCL device to use for the engine.
+/// @param context Pointer to the SYCL context to use for the engine.
+/// @param alloc The allocator to attach to the engine.
+/// @returns #dnnl_success on success and a status describing the
+///     error otherwise.
+dnnl_status_t DNNL_API dnnl_graph_sycl_interop_make_engine_with_allocator(
+        dnnl_engine_t *engine, const void *device, const void *context,
+        const_dnnl_graph_allocator_t alloc);
+
+/// Executes a compiled partition with the SYCL runtime.
+///
+/// @param compiled_partition The handle of the target compiled_partition.
+/// @param stream The stream used for execution.
+/// @param num_inputs The number of input tensors.
+/// @param inputs A list of input tensors.
+/// @param num_outputs The number of output tensors.
+/// @param outputs A non-empty list of output tensors.
+/// @param deps Optional handle of a list with `sycl::event` dependencies.
+/// @param sycl_event The handle of a SYCL event.
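Review note: the two call-back typedefs above only pin down signatures. A minimal sketch of matching USM-based implementations follows; the names `my_usm_allocate` and `my_usm_deallocate` are hypothetical, and it assumes the library passes the `sycl::device` and `sycl::context` through the opaque `dev`/`context` pointers (which is how the C++ wrapper later in this patch forwards them).

```cpp
#include <sycl/sycl.hpp>

// Hypothetical allocate call-back: cast the opaque pointers back to SYCL
// objects and return USM device memory, as the interface requires.
void *my_usm_allocate(
        size_t size, size_t alignment, const void *dev, const void *context) {
    return sycl::aligned_alloc_device(alignment, size,
            *static_cast<const sycl::device *>(dev),
            *static_cast<const sycl::context *>(context));
}

// Hypothetical deallocate call-back: wait on the event (when provided)
// before freeing the USM memory returned by the allocate call-back.
void my_usm_deallocate(
        void *buf, const void *dev, const void *context, void *event) {
    if (event) static_cast<sycl::event *>(event)->wait();
    sycl::free(buf, *static_cast<const sycl::context *>(context));
}
```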
+/// @returns #dnnl_success on success and a status describing the
+///     error otherwise.
+dnnl_status_t DNNL_API dnnl_graph_sycl_interop_compiled_partition_execute(
+        const_dnnl_graph_compiled_partition_t compiled_partition,
+        dnnl_stream_t stream, size_t num_inputs,
+        const_dnnl_graph_tensor_t *inputs, size_t num_outputs,
+        const_dnnl_graph_tensor_t *outputs, const void *deps,
+        void *sycl_event);
+
+/// @} dnnl_graph_api_sycl_interop
+
+/// @} dnnl_graph_api_interop
+
+/// @} dnnl_graph_api
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.hpp b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..acb28b905319338bfb97d4fb39044161a827fbd6
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_graph_sycl.hpp
@@ -0,0 +1,131 @@
+/*******************************************************************************
+* Copyright 2020-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// Graph SYCL interop API
+
+#ifndef ONEAPI_DNNL_DNNL_GRAPH_SYCL_HPP
+#define ONEAPI_DNNL_DNNL_GRAPH_SYCL_HPP
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <vector>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#error "Unsupported compiler"
+#endif
+
+#include "oneapi/dnnl/dnnl_graph.hpp"
+#include "oneapi/dnnl/dnnl_graph_sycl.h"
+/// @endcond
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_graph_api
+/// @{
+
+namespace graph {
+
+/// @addtogroup dnnl_graph_api_interop Runtime interoperability API
+/// API extensions to interact with the underlying run-time.
+/// @{
+
+/// @addtogroup dnnl_graph_api_sycl_interop SYCL interoperability API
+/// API extensions to interact with the underlying SYCL run-time.
+/// @{
+
+/// SYCL interoperability namespace
+namespace sycl_interop {
+
+/// Constructs an allocator from SYCL malloc and free function pointers. A
+/// SYCL allocator should be used with the SYCL runtime, and a host allocator
+/// with a non-SYCL runtime. Currently, only a device USM allocator is
+/// supported.
+///
+/// @param sycl_malloc The pointer to a SYCL malloc function.
+/// @param sycl_free The pointer to a SYCL free function.
+/// @returns Created allocator
+inline allocator make_allocator(dnnl_graph_sycl_allocate_f sycl_malloc,
+        dnnl_graph_sycl_deallocate_f sycl_free) {
+    dnnl_graph_allocator_t c_allocator = nullptr;
+    error::wrap_c_api(dnnl_graph_sycl_interop_allocator_create(
+                              &c_allocator, sycl_malloc, sycl_free),
+            "could not create allocator for sycl device");
+    return allocator(c_allocator);
+}
+
+/// Constructs an engine associated with a SYCL device and a SYCL context
+/// with an allocator attached.
+///
+/// @param adevice SYCL device.
+/// @param acontext SYCL context.
+/// @param alloc Allocator to attach to the engine.
+/// @returns Created engine.
+inline engine make_engine_with_allocator(const sycl::device &adevice,
+        const sycl::context &acontext, const allocator &alloc) {
+    dnnl_engine_t c_engine;
+    error::wrap_c_api(
+            dnnl_graph_sycl_interop_make_engine_with_allocator(&c_engine,
+                    static_cast<const void *>(&adevice),
+                    static_cast<const void *>(&acontext), alloc.get()),
+            "could not make an engine with allocator");
+    return engine(c_engine);
+}
+
+/// Executes a compiled partition in a specified stream and returns a SYCL
+/// event.
+///
+/// @param c_partition Compiled partition to execute.
+/// @param astream Stream object to run the execution over.
+/// @param inputs A list of input tensors.
+/// @param outputs A list of output tensors.
+/// @param deps Optional vector with `sycl::event` dependencies.
+/// @returns Output event.
+inline sycl::event execute(compiled_partition &c_partition, stream &astream,
+        const std::vector<tensor> &inputs, std::vector<tensor> &outputs,
+        const std::vector<sycl::event> &deps = {}) {
+    std::vector<const_dnnl_graph_tensor_t> c_inputs;
+    c_inputs.reserve(inputs.size());
+    for (auto &in : inputs) {
+        c_inputs.push_back(in.get());
+    }
+    std::vector<const_dnnl_graph_tensor_t> c_outputs;
+    c_outputs.reserve(outputs.size());
+    for (auto &out : outputs) {
+        c_outputs.push_back(out.get());
+    }
+
+    sycl::event sycl_event;
+    error::wrap_c_api(dnnl_graph_sycl_interop_compiled_partition_execute(
+                              c_partition.get(), astream.get(),
+                              c_inputs.size(), c_inputs.data(),
+                              c_outputs.size(), c_outputs.data(), &deps,
+                              &sycl_event),
+            "could not execute the compiled_partition on a specified sycl "
+            "stream");
+    return sycl_event;
+}
+
+} // namespace sycl_interop
+
+/// @} dnnl_graph_api_sycl_interop
+
+/// @} dnnl_graph_api_interop
+
+} // namespace graph
+
+/// @} dnnl_graph_api
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_graph_types.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_graph_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4193c7bdb517676571b8331f2e89c57ded6c5630
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_graph_types.h
@@ -0,0 +1,475 @@
+/*******************************************************************************
+ * Copyright 2020-2025 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
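Review note: taken together, the three sycl_interop helpers defined just above compose end to end roughly as follows. This is a sketch only: `cp`, `inputs`, and `outputs` are assumed to be a compiled partition and tensor vectors prepared elsewhere, and the call-backs are the hypothetical ones sketched earlier.

```cpp
namespace gsi = dnnl::graph::sycl_interop;

sycl::queue q {sycl::gpu_selector_v};

// Attach the USM call-backs to an allocator, then bind the allocator to an
// engine built on the queue's device and context.
dnnl::graph::allocator alloc
        = gsi::make_allocator(my_usm_allocate, my_usm_deallocate);
dnnl::engine eng = gsi::make_engine_with_allocator(
        q.get_device(), q.get_context(), alloc);
dnnl::stream strm = dnnl::sycl_interop::make_stream(eng, q);

// Execute the compiled partition and synchronize on the returned event.
sycl::event done = gsi::execute(cp, strm, inputs, outputs);
done.wait();
```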
+ *******************************************************************************/
+
+/// @file
+/// C API definitions
+
+#ifndef ONEAPI_DNNL_DNNL_GRAPH_TYPES_H
+#define ONEAPI_DNNL_DNNL_GRAPH_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <stddef.h>
+#include <stdint.h>
+
+#include "oneapi/dnnl/dnnl_common_types.h"
+/// @endcond
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_graph_api
+/// @{
+
+/// @addtogroup dnnl_graph_api_logical_tensor
+/// @{
+
+/// A wildcard value for the number of dimensions which is unknown at a
+/// tensor or operation creation time.
+#define DNNL_GRAPH_UNKNOWN_NDIMS -1
+
+/// A wildcard value for dimensions that are unknown at a tensor or operation
+/// creation time.
+#define DNNL_GRAPH_UNKNOWN_DIM INT64_MIN
+
+/// Layout type specification
+typedef enum {
+    /// Undefined layout type
+    dnnl_graph_layout_type_undef = 0,
+    /// Any means to let the library decide the layout for a tensor during
+    /// partition compilation.
+    dnnl_graph_layout_type_any = 1,
+    /// Strided means that the layout of a tensor is determined by the
+    /// strides field in the logical tensor.
+    dnnl_graph_layout_type_strided = 2,
+    /// Opaque means that the layout of a tensor is library-specific.
+    /// Usually, an opaque layout is generated by a partition which is
+    /// compiled with layout type any.
+    dnnl_graph_layout_type_opaque = 3,
+} dnnl_graph_layout_type_t;
+
+/// Logical tensor property
+typedef enum {
+    /// Undefined tensor property
+    dnnl_graph_tensor_property_undef = 0,
+    /// Variable means the tensor may be changed during computation or
+    /// between different iterations.
+    dnnl_graph_tensor_property_variable = 1,
+    /// Constant means the tensor will remain unchanged during computation
+    /// and between different iterations. It's useful for the library to
+    /// apply optimizations for constant tensors or cache constant tensors
+    /// inside the library. For example, constant weight tensors in
+    /// inference scenarios.
+    dnnl_graph_tensor_property_constant = 2,
+} dnnl_graph_tensor_property_t;
+
+/// Logical tensor. It is based on an ID, a number of dimensions, the
+/// dimensions themselves, element data type, tensor property, and tensor
+/// memory layout.
+typedef struct {
+    /// Unique ID of each logical tensor. The library uses logical tensor
+    /// IDs to build up the connections between operations if the output of
+    /// one operation has the same ID as the input of another operation.
+    size_t id;
+
+    /// Number of dimensions. -1 means unknown (DNNL_GRAPH_UNKNOWN_NDIMS).
+    /// 0 is used to define a scalar tensor.
+    int ndims;
+
+    /// Size of each dimension. #DNNL_GRAPH_UNKNOWN_DIM means the size of
+    /// that dimension is unknown. 0 is used to define a zero-dimension
+    /// tensor. The library supports deducing output shapes from input
+    /// shapes during compilation. Unlike the memory descriptor in the
+    /// oneDNN primitive API, the order of dimensions is not defined in a
+    /// logical tensor. It is defined by the operations which respect the
+    /// order through the attributes #dnnl_graph_op_attr_data_format or
+    /// #dnnl_graph_op_attr_weights_format. For example, for a Convolution
+    /// with `data_format=NXC`, the first element of dims of the activation
+    /// tensor is the mini-batch size, the last effective element is the
+    /// channel size, and the elements between them are spatial dimensions.
+    dnnl_dims_t dims;
+
+    /// Data type of the tensor elements.
+    dnnl_data_type_t data_type;
+
+    /// Property type of the tensor.
+    dnnl_graph_tensor_property_t property;
+
+    /// Layout type of the tensor.
+    dnnl_graph_layout_type_t layout_type;
+    union {
+        /// The field is valid when `layout_type` is
+        /// #dnnl_graph_layout_type_strided. #DNNL_GRAPH_UNKNOWN_DIM means
+        /// the stride of the dimension is unknown. The library currently
+        /// doesn't support other negative stride values.
+        dnnl_dims_t strides;
+
+        /// The field is valid when `layout_type` is
+        /// #dnnl_graph_layout_type_opaque. An opaque layout ID is usually
+        /// generated by a partition which is compiled with layout type any.
+        size_t layout_id;
+    } layout;
+} dnnl_graph_logical_tensor_t;
+
+/// @} dnnl_graph_api_logical_tensor
+
+/// @addtogroup dnnl_graph_api_partition
+/// @{
+
+/// Policy specifications for partitioning
+typedef enum {
+    /// Fusion policy returns partitions with typical post-op fusions, e.g.,
+    /// Convolution + ReLU or other element-wise operations or a chain of
+    /// post-ops.
+    dnnl_graph_partition_policy_fusion = 1,
+    /// Debug policy does not apply any fusions. It returns partitions with
+    /// a single operation in each partition. The policy is useful when
+    /// users notice any bug or correctness issue in the fusion policy.
+    dnnl_graph_partition_policy_debug = 2,
+} dnnl_graph_partition_policy_t;
+
+/// An opaque structure to describe a partition.
+struct dnnl_graph_partition;
+
+/// A partition handle.
+typedef struct dnnl_graph_partition *dnnl_graph_partition_t;
+
+/// A constant partition handle.
+typedef const struct dnnl_graph_partition *const_dnnl_graph_partition_t;
+
+/// @} dnnl_graph_api_partition
+
+/// @addtogroup dnnl_graph_api_graph
+/// @{
+
+/// An opaque structure to describe a graph.
+struct dnnl_graph_graph;
+
+/// A graph handle.
+typedef struct dnnl_graph_graph *dnnl_graph_graph_t;
+
+/// A constant graph handle.
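Review note: as an illustration of the struct just defined, a plain strided f32 tensor of hypothetical shape {8, 256} could be described like this (values are illustrative, not from the source):

```cpp
// Sketch: a row-major strided f32 logical tensor with illustrative values.
dnnl_graph_logical_tensor_t lt = {};
lt.id = 0; // matching IDs connect op outputs to op inputs
lt.ndims = 2;
lt.dims[0] = 8; // mini-batch
lt.dims[1] = 256; // channels
lt.data_type = dnnl_f32;
lt.property = dnnl_graph_tensor_property_variable;
lt.layout_type = dnnl_graph_layout_type_strided;
lt.layout.strides[0] = 256; // row-major strides for an 8x256 tensor
lt.layout.strides[1] = 1;
```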
+typedef const struct dnnl_graph_graph *const_dnnl_graph_graph_t;
+
+/// @} dnnl_graph_api_graph
+
+/// @addtogroup dnnl_graph_api_op
+/// @{
+
+/// Kinds of operations
+typedef enum {
+    dnnl_graph_op_abs,
+    dnnl_graph_op_abs_backward,
+    dnnl_graph_op_add,
+    dnnl_graph_op_avg_pool,
+    dnnl_graph_op_avg_pool_backward,
+    dnnl_graph_op_batch_norm_backward,
+    dnnl_graph_op_batch_norm_forward_training,
+    dnnl_graph_op_batch_norm_inference,
+    dnnl_graph_op_bias_add,
+    dnnl_graph_op_bias_add_backward,
+    dnnl_graph_op_clamp,
+    dnnl_graph_op_clamp_backward,
+    dnnl_graph_op_concat,
+    dnnl_graph_op_convolution,
+    dnnl_graph_op_convolution_backward_data,
+    dnnl_graph_op_convolution_backward_weights,
+    dnnl_graph_op_conv_transpose,
+    dnnl_graph_op_conv_transpose_backward_data,
+    dnnl_graph_op_conv_transpose_backward_weights,
+    dnnl_graph_op_dequantize,
+    dnnl_graph_op_divide,
+    dnnl_graph_op_dynamic_dequantize,
+    dnnl_graph_op_dynamic_quantize,
+    dnnl_graph_op_elu,
+    dnnl_graph_op_elu_backward,
+    dnnl_graph_op_end,
+    dnnl_graph_op_exp,
+    dnnl_graph_op_gelu,
+    dnnl_graph_op_gelu_backward,
+    dnnl_graph_op_hard_swish,
+    dnnl_graph_op_hard_swish_backward,
+    dnnl_graph_op_interpolate,
+    dnnl_graph_op_interpolate_backward,
+    dnnl_graph_op_layer_norm,
+    dnnl_graph_op_layer_norm_backward,
+    dnnl_graph_op_leaky_relu,
+    dnnl_graph_op_log,
+    dnnl_graph_op_log_softmax,
+    dnnl_graph_op_log_softmax_backward,
+    dnnl_graph_op_matmul,
+    dnnl_graph_op_maximum,
+    dnnl_graph_op_max_pool,
+    dnnl_graph_op_max_pool_backward,
+    dnnl_graph_op_minimum,
+    dnnl_graph_op_mish,
+    dnnl_graph_op_mish_backward,
+    dnnl_graph_op_multiply,
+    dnnl_graph_op_prelu,
+    dnnl_graph_op_prelu_backward,
+    dnnl_graph_op_quantize,
+    dnnl_graph_op_reciprocal,
+    dnnl_graph_op_reduce_l1,
+    dnnl_graph_op_reduce_l2,
+    dnnl_graph_op_reduce_max,
+    dnnl_graph_op_reduce_mean,
+    dnnl_graph_op_reduce_min,
+    dnnl_graph_op_reduce_prod,
+    dnnl_graph_op_reduce_sum,
+    dnnl_graph_op_relu,
+    dnnl_graph_op_relu_backward,
+    dnnl_graph_op_reorder,
+    dnnl_graph_op_round,
+    dnnl_graph_op_sigmoid,
+    dnnl_graph_op_sigmoid_backward,
+    dnnl_graph_op_softmax,
+    dnnl_graph_op_softmax_backward,
+    dnnl_graph_op_softplus,
+    dnnl_graph_op_softplus_backward,
+    dnnl_graph_op_sqrt,
+    dnnl_graph_op_sqrt_backward,
+    dnnl_graph_op_square,
+    dnnl_graph_op_squared_difference,
+    dnnl_graph_op_static_reshape,
+    dnnl_graph_op_static_transpose,
+    dnnl_graph_op_subtract,
+    dnnl_graph_op_tanh,
+    dnnl_graph_op_tanh_backward,
+    dnnl_graph_op_type_cast,
+    dnnl_graph_op_wildcard,
+    dnnl_graph_op_hard_sigmoid,
+    dnnl_graph_op_hard_sigmoid_backward,
+    dnnl_graph_op_select,
+    dnnl_graph_op_pow,
+    dnnl_graph_op_group_norm,
+    dnnl_graph_op_gen_index,
+    dnnl_graph_op_greater_equal,
+    dnnl_graph_op_last_symbol,
+} dnnl_graph_op_kind_t;
+
+/// Attributes of operations
+typedef enum {
+    /// Undefined op attribute.
+    dnnl_graph_op_attr_undef = 0,
+
+    // float32 attributes. The value of these attributes can be any single
+    // float32 number.
+
+    /// Specifies an alpha attribute to an op.
+    dnnl_graph_op_attr_alpha = 0x1,
+    /// Specifies a beta attribute to an op.
+    dnnl_graph_op_attr_beta,
+    /// Specifies an epsilon attribute to an op.
+    dnnl_graph_op_attr_epsilon,
+    /// Specifies a max attribute to an op.
+    dnnl_graph_op_attr_max,
+    /// Specifies a min attribute to an op.
+    dnnl_graph_op_attr_min,
+    /// Specifies a momentum attribute to an op.
+    dnnl_graph_op_attr_momentum,
+
+    // float32 vector attributes. The value of these attributes can be a
+    // vector of float32 numbers.
+
+    /// Specifies a scales attribute to an op.
+    dnnl_graph_op_attr_scales = 0x20,
+
+    // int64_t attributes. The value of these attributes can be any single
+    // int64 number.
+
+    /// Specifies an axis attribute to an op.
+    dnnl_graph_op_attr_axis = 0x30,
+    /// Specifies a begin_norm_axis attribute to an op.
+    dnnl_graph_op_attr_begin_norm_axis,
+    /// Specifies a groups attribute to an op.
+    dnnl_graph_op_attr_groups,
+
+    // int64_t vector attributes. The value of these attributes can be a
+    // vector of int64 numbers.
+
+    /// Specifies an axes attribute to an op.
+    dnnl_graph_op_attr_axes = 0x40,
+    /// Specifies a dilations attribute to an op.
+    dnnl_graph_op_attr_dilations,
+    /// Specifies a dst_shape attribute to an op.
+    dnnl_graph_op_attr_dst_shape,
+    /// Specifies a kernel attribute to an op.
+    dnnl_graph_op_attr_kernel,
+    /// Specifies an order attribute to an op.
+    dnnl_graph_op_attr_order,
+    /// Specifies an output_padding attribute to an op.
+    dnnl_graph_op_attr_output_padding,
+    /// Specifies a pads_begin attribute to an op.
+    dnnl_graph_op_attr_pads_begin,
+    /// Specifies a pads_end attribute to an op.
+    dnnl_graph_op_attr_pads_end,
+    /// Specifies a shape attribute to an op.
+    dnnl_graph_op_attr_shape,
+    /// Specifies a sizes attribute to an op.
+    dnnl_graph_op_attr_sizes,
+    /// Specifies a src_shape attribute to an op.
+    dnnl_graph_op_attr_src_shape,
+    /// Specifies a strides attribute to an op.
+    dnnl_graph_op_attr_strides,
+    /// Specifies a weights_shape attribute to an op.
+    dnnl_graph_op_attr_weights_shape,
+    /// Specifies a zps attribute to an op.
+    dnnl_graph_op_attr_zps,
+    /// Specifies a group_shape attribute to an op.
+    dnnl_graph_op_attr_group_shape,
+
+    // bool attributes. The value of these attributes can be any single
+    // bool value.
+
+    /// Specifies an exclude_pad attribute to an op.
+    dnnl_graph_op_attr_exclude_pad = 0x60,
+    /// Specifies a keep_dims attribute to an op.
+    dnnl_graph_op_attr_keep_dims,
+    /// Specifies a keep_stats attribute to an op.
+    dnnl_graph_op_attr_keep_stats,
+    /// Specifies a per_channel_broadcast attribute to an op.
+    dnnl_graph_op_attr_per_channel_broadcast,
+    /// Specifies a special_zero attribute to an op.
+    dnnl_graph_op_attr_special_zero,
+    /// Specifies a transpose_a attribute to an op.
+    dnnl_graph_op_attr_transpose_a,
+    /// Specifies a transpose_b attribute to an op.
+    dnnl_graph_op_attr_transpose_b,
+    /// Specifies a use_affine attribute to an op.
+    dnnl_graph_op_attr_use_affine,
+    /// Specifies a use_dst attribute to an op.
+    dnnl_graph_op_attr_use_dst,
+
+    // string attributes. The value of these attributes can be a string.
+
+    /// Specifies an auto_broadcast attribute to an op. The value can be
+    /// "none" or "numpy".
+    dnnl_graph_op_attr_auto_broadcast = 0x80,
+    /// Specifies an auto_pad attribute to an op. The value can be "none",
+    /// "same_upper", "same_lower", or "valid".
+    dnnl_graph_op_attr_auto_pad,
+    /// Specifies a coordinate_transformation_mode attribute to an op. The
+    /// value can be "half_pixel" or "align_corners". The attribute is
+    /// defined for Interpolate operations.
+    dnnl_graph_op_attr_coordinate_transformation_mode,
+    /// Specifies a data_format of an op. The value can be "NCX" or "NXC".
+    dnnl_graph_op_attr_data_format,
+    /// Specifies a mode attribute of an op. The value can be "nearest",
+    /// "linear", "bilinear", or "trilinear". The attribute is defined for
+    /// Interpolate operations.
+    dnnl_graph_op_attr_mode,
+    /// Specifies a qtype attribute to an op. The value can be
The value can be "per_channel" or + /// "per_tensor". The attribute is defined for quantization operations. + dnnl_graph_op_attr_qtype, + /// Specifies a rounding_type attribute to an op. The value can be "ceil" or + /// "floor". + dnnl_graph_op_attr_rounding_type, + /// Specifies a weights_format of an op. The value can be "OIX", "XIO", + /// "IOX", or "XOI". Different operations may support different values. + dnnl_graph_op_attr_weights_format, + + /// Specifies the end of all above exteral attributes for check. + dnnl_graph_op_attr_end = 0xFF, +} dnnl_graph_op_attr_t; + +/// An opaque structure to describe an operation. +struct dnnl_graph_op; + +/// An operation handle. +typedef struct dnnl_graph_op *dnnl_graph_op_t; + +/// A constant operation handle. +typedef const struct dnnl_graph_op *const_dnnl_graph_op_t; + +/// @} dnnl_graph_api_op + +/// @addtogroup dnnl_graph_api_allocator +/// @{ + +/// Allocation call-back function interface for host. For SYCL allocator, see +/// #dnnl_graph_sycl_allocate_f. +typedef void *(*dnnl_graph_host_allocate_f)(size_t size, size_t alignment); + +/// Deallocation call-back function interface for host. For SYCL allocator, see +/// #dnnl_graph_sycl_deallocate_f. +typedef void (*dnnl_graph_host_deallocate_f)(void *); + +/// An opaque structure to describe an allocator. +struct dnnl_graph_allocator; + +/// An allocator handle. +typedef struct dnnl_graph_allocator *dnnl_graph_allocator_t; + +/// A constant allocator handle. +typedef const struct dnnl_graph_allocator *const_dnnl_graph_allocator_t; + +/// @} dnnl_graph_api_allocator + +/// @addtogroup dnnl_graph_api_compiled_partition +/// @{ + +/// In-place pair definition. It can queried from a compiled partition +/// indicating that an input and an output of the partition can share the same +/// memory buffer for computation. In-place computation helps to reduce the +/// memory footprint and improves cache locality. But since the library may not +/// have a global view of user's application, it's possible that the tensor with +/// `input_id` is used at other places in user's computation graph. In this +/// case, the user should take the in-place pair as a hint and pass a different +/// memory buffer for output tensor to avoid overwriting the input memory buffer +/// which will probably cause unexpected incorrect results. +typedef struct { + /// The id of input tensor + size_t input_id; + + /// The id of output tensor + size_t output_id; +} dnnl_graph_inplace_pair_t; + +/// An opaque structure to describe a compiled partition. +struct dnnl_graph_compiled_partition; + +/// A compiled partition handle. +typedef struct dnnl_graph_compiled_partition *dnnl_graph_compiled_partition_t; + +/// A constant compiled partition handle. +typedef const struct dnnl_graph_compiled_partition + *const_dnnl_graph_compiled_partition_t; + +/// @} dnnl_graph_api_compiled_partition + +/// @addtogroup dnnl_graph_api_tensor +/// @{ + +/// An opaque structure to describe a tensor. +struct dnnl_graph_tensor; + +/// A tensor handle. +typedef struct dnnl_graph_tensor *dnnl_graph_tensor_t; + +/// A constant tensor handle. 
+typedef const struct dnnl_graph_tensor *const_dnnl_graph_tensor_t;
+
+/// @} dnnl_graph_api_tensor
+
+/// @} dnnl_graph_api
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.h
new file mode 100644
index 0000000000000000000000000000000000000000..225dec78608f7bb4b88f0d4d7dcbaa72ce153abd
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.h
@@ -0,0 +1,276 @@
+/*******************************************************************************
+* Copyright 2020-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_OCL_H
+#define ONEAPI_DNNL_DNNL_OCL_H
+
+#include "oneapi/dnnl/dnnl.h"
+
+#include "oneapi/dnnl/dnnl_ocl_types.h"
+
+/// @cond DO_NOT_DOCUMENT_THIS
+// Set target version for OpenCL explicitly to suppress a compiler warning.
+#ifndef CL_TARGET_OPENCL_VERSION
+#define CL_TARGET_OPENCL_VERSION 120
+#endif
+
+#include <CL/cl.h>
+/// @endcond
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_ocl_interop
+/// @{
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE,
+/// the constructed memory object will have the underlying buffer set. In
+/// this case, the buffer will be initialized as if:
+/// - dnnl_memory_set_data_handle() has been called, if @p memory_kind is
+///   equal to dnnl_ocl_interop_usm, or
+/// - dnnl_ocl_interop_memory_set_mem_object() has been called, if
+///   @p memory_kind is equal to dnnl_ocl_interop_buffer.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param memory_kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying
+///     storage.
+///     - A USM pointer to the user-allocated buffer. In this case the
+///       library doesn't own the buffer. Requires @p memory_kind to be
+///       equal to dnnl_ocl_interop_usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p memory_kind to be equal to dnnl_ocl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create a memory object without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_create(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        dnnl_ocl_interop_memory_kind_t memory_kind, void *handle);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param memory_kind Memory allocation kind to specify the type of handles.
+/// @param nhandles Number of handles.
+/// @param handles Handles of the memory buffers to use as underlying
+///     storages. For each element of the @p handles array the following
+///     applies:
+///     - A USM pointer to the user-allocated buffer. In this case the
+///       library doesn't own the buffer. Requires @p memory_kind to be
+///       equal to dnnl_ocl_interop_usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p memory_kind to be equal to dnnl_ocl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create a memory object without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_create_v2(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        dnnl_ocl_interop_memory_kind_t memory_kind, int nhandles,
+        void **handles);
+#endif
+
+/// Returns the memory allocation kind associated with a memory object.
+///
+/// @param memory Memory to query.
+/// @param memory_kind Output underlying memory allocation kind of the
+///     memory object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_get_memory_kind(
+        const_dnnl_memory_t memory,
+        dnnl_ocl_interop_memory_kind_t *memory_kind);
+
+/// Returns an OpenCL memory object associated with a memory object.
+///
+/// @param memory Memory object.
+/// @param mem_object Output OpenCL memory object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_get_mem_object(
+        const_dnnl_memory_t memory, cl_mem *mem_object);
+
+/// Sets the OpenCL memory object associated with a memory object.
+///
+/// For behavioral details, see dnnl_memory_set_data_handle().
+///
+/// @param memory Memory object.
+/// @param mem_object OpenCL memory object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_set_mem_object(
+        dnnl_memory_t memory, cl_mem mem_object);
+
+/// Retrieves a cache blob ID for the OpenCL device.
+///
+/// @warning
+///     This API is intended to be used with
+///     #dnnl_ocl_interop_engine_get_cache_blob() and
+///     #dnnl_ocl_interop_engine_create_from_cache_blob(). The returned cache
+///     blob ID can only be used as an ID of the cache blob returned by
+///     #dnnl_ocl_interop_engine_get_cache_blob().
+///
+/// @note The cache blob ID can be empty (@p size will be 0 and
+///     @p cache_blob_id will be nullptr) if oneDNN doesn't have anything to
+///     put in the cache blob. (#dnnl_ocl_interop_engine_get_cache_blob will
+///     return an empty cache blob).
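Review note: the memory-object functions above combine in two common ways. A sketch at the C level, where `md`, `eng`, and `ocl_buf` are assumed to be a memory descriptor, an engine, and an existing OpenCL buffer:

```cpp
// Wrap an existing cl_mem: create the memory object without a buffer, then
// attach the user's OpenCL buffer to it.
dnnl_memory_t mem;
dnnl_status_t st = dnnl_ocl_interop_memory_create(
        &mem, md, eng, dnnl_ocl_interop_buffer, DNNL_MEMORY_NONE);
if (st == dnnl_success)
    st = dnnl_ocl_interop_memory_set_mem_object(mem, ocl_buf);

// Alternatively, let the library allocate (and own) the buffer itself.
dnnl_memory_t owned;
st = dnnl_ocl_interop_memory_create(
        &owned, md, eng, dnnl_ocl_interop_buffer, DNNL_MEMORY_ALLOCATE);
```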
+///
+/// @param device An OpenCL device.
+/// @param size Size of the cache blob ID in bytes.
+/// @param cache_blob_id Cache blob ID of size @p size. If
+///     the @p cache_blob_id is nullptr then the size of the cache blob ID
+///     is returned in @p size.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_cache_blob_id(
+        cl_device_id device, size_t *size, uint8_t *cache_blob_id);
+
+/// Retrieves a cache blob associated with the given engine.
+///
+/// @note The cache blob can be empty (@p size will be 0 and @p cache_blob
+///     will be nullptr) if oneDNN doesn't have anything to put in the cache
+///     blob. It's the user's responsibility to check whether it's empty
+///     prior to passing it to
+///     #dnnl_ocl_interop_engine_create_from_cache_blob().
+///
+/// @param engine Engine to query for the cache blob.
+/// @param size Size of the cache blob in bytes.
+/// @param cache_blob Cache blob of size @p size. If the @p cache_blob is
+///     nullptr then the size of the cache blob is returned in @p size.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_cache_blob(
+        dnnl_engine_t engine, size_t *size, uint8_t *cache_blob);
+
+/// Creates an engine from the given cache blob.
+///
+/// @param engine Output engine.
+/// @param device The OpenCL device that this engine will encapsulate.
+/// @param context The OpenCL context (containing the device) that this
+///     engine will use for all operations.
+/// @param size Size of the cache blob in bytes.
+/// @param cache_blob Cache blob of size @p size.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create_from_cache_blob(
+        dnnl_engine_t *engine, cl_device_id device, cl_context context,
+        size_t size, const uint8_t *cache_blob);
+
+/// Creates an engine associated with an OpenCL device and an OpenCL context.
+///
+/// @param engine Output engine.
+/// @param device Underlying OpenCL device to use for the engine.
+/// @param context Underlying OpenCL context to use for the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_create(
+        dnnl_engine_t *engine, cl_device_id device, cl_context context);
+
+/// Returns the OpenCL context associated with an engine.
+///
+/// @param engine Engine to query.
+/// @param context Output underlying OpenCL context of the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_engine_get_context(
+        dnnl_engine_t engine, cl_context *context);
+
+/// Returns the OpenCL device associated with an engine.
+///
+/// @param engine Engine to query.
+/// @param device Output underlying OpenCL device of the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_get_device(
+        dnnl_engine_t engine, cl_device_id *device);
+
+/// Creates an execution stream for a given engine associated with
+/// an OpenCL command queue.
+///
+/// @param stream Output execution stream.
+/// @param engine Engine to create the execution stream on.
+/// @param queue OpenCL command queue to use.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_stream_create(
+        dnnl_stream_t *stream, dnnl_engine_t engine, cl_command_queue queue);
+
+/// Returns the OpenCL command queue associated with an execution stream.
+///
+/// @param stream Execution stream to query.
+/// @param queue Output OpenCL command queue.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_stream_get_command_queue(
+        dnnl_stream_t stream, cl_command_queue *queue);
+
+/// Executes computations specified by the primitive in a specified stream
+/// and returns an OpenCL event.
+///
+/// @param primitive Primitive to execute.
+/// @param stream Stream to use.
+/// @param nargs Number of arguments.
+/// @param args Array of arguments. Each argument is an
+///     <index, memory object> pair. The index is one of the `DNNL_ARG_*`
+///     values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see
+///     #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory
+///     descriptor as that returned by
+///     #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index).
+/// @param deps A pointer to a vector of size @p ndeps that contains
+///     dependencies.
+/// @param ndeps Number of dependencies.
+/// @param return_event Output event. It's the user's responsibility to
+///     manage the lifetime of the event. Can be NULL. When @p stream is
+///     in-order, NULL will be returned.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_primitive_execute(
+        const_dnnl_primitive_t primitive, dnnl_stream_t stream, int nargs,
+        const dnnl_exec_arg_t *args, const cl_event *deps, int ndeps,
+        cl_event *return_event);
+
+/// @} dnnl_api_ocl_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.hpp b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..36416511e4f473e647f49a593f0ad1937462ecef
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ocl.hpp
@@ -0,0 +1,445 @@
+/*******************************************************************************
+* Copyright 2020-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_OCL_HPP
+#define ONEAPI_DNNL_DNNL_OCL_HPP
+
+#include "oneapi/dnnl/dnnl.hpp"
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <algorithm>
+#include <cstdlib>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "oneapi/dnnl/dnnl_ocl.h"
+
+#include <CL/cl.h>
+/// @endcond
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop Runtime interoperability API
+/// API extensions to interact with the underlying run-time.
+/// @{
+
+/// @addtogroup dnnl_api_ocl_interop OpenCL interoperability API
+/// API extensions to interact with the underlying OpenCL run-time.
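Review note: to make the event plumbing of the C-level execute call above concrete, a sketch (it assumes `prim`, `strm`, `args`, `nargs`, and two earlier events `ev0`/`ev1` already exist):

```cpp
// Chain the primitive after two earlier OpenCL events and wait on its
// completion event; the caller owns (and must release) the returned event.
cl_event deps[2] = {ev0, ev1};
cl_event done;
dnnl_status_t st = dnnl_ocl_interop_primitive_execute(
        prim, strm, nargs, args, deps, 2, &done);
if (st == dnnl_success) {
    clWaitForEvents(1, &done);
    clReleaseEvent(done);
}
```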
+///
+/// @sa @ref dev_guide_opencl_interoperability in developer guide
+/// @{
+
+/// OpenCL interoperability namespace
+namespace ocl_interop {
+
+/// Memory allocation kind.
+enum class memory_kind {
+    /// USM (device, shared, host, or unknown) memory allocation kind.
+    usm = dnnl_ocl_interop_usm,
+    /// Buffer memory allocation kind - default.
+    buffer = dnnl_ocl_interop_buffer,
+};
+
+/// Converts a memory allocation kind enum value from C++ API to C API type.
+///
+/// @param akind C++ API memory allocation kind enum value.
+/// @returns Corresponding C API memory allocation kind enum value.
+inline dnnl_ocl_interop_memory_kind_t convert_to_c(memory_kind akind) {
+    return static_cast<dnnl_ocl_interop_memory_kind_t>(akind);
+}
+
+/// Returns the cache blob ID of the OpenCL device.
+///
+/// @warning
+///     This API is intended to be used with
+///     #dnnl::ocl_interop::get_engine_cache_blob() and
+///     #dnnl::ocl_interop::make_engine(cl_device_id, cl_context,
+///     const std::vector<uint8_t> &). The returned cache blob ID can only
+///     be used as an ID of the cache blob returned by
+///     #dnnl::ocl_interop::get_engine_cache_blob().
+///
+/// @note The cache blob ID can be empty (@p size will be 0 and
+///     @p cache_blob_id will be nullptr) if oneDNN doesn't have anything to
+///     put in the cache blob. (#dnnl_ocl_interop_engine_get_cache_blob will
+///     return an empty cache blob).
+///
+/// @param device An OpenCL device.
+/// @returns A vector containing the cache blob ID.
+inline std::vector<uint8_t> get_engine_cache_blob_id(cl_device_id device) {
+    size_t size = 0;
+    error::wrap_c_api(
+            dnnl_ocl_interop_engine_get_cache_blob_id(device, &size, nullptr),
+            "could not get an engine cache blob id size");
+
+    std::vector<uint8_t> cache_blob_id(size);
+    error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob_id(
+                              device, &size, cache_blob_id.data()),
+            "could not get an engine cache blob id");
+    return cache_blob_id;
+}
+
+/// Returns a cache blob for the engine.
+///
+/// @note The cache blob vector can be empty if oneDNN doesn't have anything
+///     to put in the cache blob. It's the user's responsibility to check
+///     whether it's empty prior to passing it to
+///     #dnnl::ocl_interop::make_engine(cl_device_id, cl_context,
+///     const std::vector<uint8_t> &)
+///
+/// @param aengine Engine to query for the cache blob.
+/// @returns Vector containing the cache blob.
+inline std::vector<uint8_t> get_engine_cache_blob(const engine &aengine) {
+    size_t size = 0;
+    error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob(
+                              aengine.get(), &size, nullptr),
+            "could not get an engine cache blob size");
+
+    std::vector<uint8_t> cache_blob(size);
+    error::wrap_c_api(dnnl_ocl_interop_engine_get_cache_blob(
+                              aengine.get(), &size, cache_blob.data()),
+            "could not get an engine cache blob");
+    return cache_blob;
+}
+
+/// Constructs an engine from the given cache blob.
+///
+/// @param device The OpenCL device that this engine will encapsulate.
+/// @param context The OpenCL context (containing the device) that this
+///     engine will use for all operations.
+/// @param cache_blob Cache blob.
+/// @returns An engine.
+inline engine make_engine(cl_device_id device, cl_context context,
+        const std::vector<uint8_t> &cache_blob) {
+    dnnl_engine_t c_engine;
+    error::wrap_c_api(
+            dnnl_ocl_interop_engine_create_from_cache_blob(&c_engine, device,
+                    context, cache_blob.size(), cache_blob.data()),
+            "could not create an engine from cache blob");
+    return engine(c_engine);
+}
+
+/// Constructs an engine from OpenCL device and context objects.
+///
+/// @param device The OpenCL device that this engine will encapsulate.
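Review note: the cache-blob helpers above are designed for a save/restore flow along these lines (a sketch; `dev` and `ctx` are an assumed OpenCL device and context, and the persistence layer is left abstract):

```cpp
// First run: create the engine normally, then persist its cache blob keyed
// by the device's cache blob ID so stale blobs can be detected later.
std::vector<uint8_t> id = dnnl::ocl_interop::get_engine_cache_blob_id(dev);
dnnl::engine eng = dnnl::ocl_interop::make_engine(dev, ctx);
std::vector<uint8_t> blob = dnnl::ocl_interop::get_engine_cache_blob(eng);
// ... store id and blob, e.g. on disk ...

// Later run: rebuild the engine from the blob only if the stored ID still
// matches the current device's ID and the blob is non-empty.
if (!blob.empty()
        && id == dnnl::ocl_interop::get_engine_cache_blob_id(dev))
    eng = dnnl::ocl_interop::make_engine(dev, ctx, blob);
```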
+/// @param context The OpenCL context (containing the device) that this
+///     engine will use for all operations.
+/// @returns An engine.
+inline engine make_engine(cl_device_id device, cl_context context) {
+    dnnl_engine_t c_engine;
+    error::wrap_c_api(
+            dnnl_ocl_interop_engine_create(&c_engine, device, context),
+            "could not create an engine");
+    return engine(c_engine);
+}
+
+/// Returns OpenCL context associated with the engine.
+///
+/// @param aengine An engine.
+/// @returns Underlying OpenCL context.
+inline cl_context get_context(const engine &aengine) {
+    cl_context context = nullptr;
+    error::wrap_c_api(
+            dnnl_ocl_interop_engine_get_context(aengine.get(), &context),
+            "could not get an OpenCL context from an engine");
+    return context;
+}
+
+/// Returns OpenCL device associated with the engine.
+///
+/// @param aengine An engine.
+/// @returns Underlying OpenCL device.
+inline cl_device_id get_device(const engine &aengine) {
+    cl_device_id device = nullptr;
+    error::wrap_c_api(dnnl_ocl_interop_get_device(aengine.get(), &device),
+            "could not get an OpenCL device from an engine");
+    return device;
+}
+
+/// Constructs an execution stream for the specified engine and OpenCL queue.
+///
+/// @param aengine Engine to create the stream on.
+/// @param queue OpenCL queue to use for the stream.
+/// @returns An execution stream.
+inline stream make_stream(const engine &aengine, cl_command_queue queue) {
+    dnnl_stream_t c_stream;
+    error::wrap_c_api(
+            dnnl_ocl_interop_stream_create(&c_stream, aengine.get(), queue),
+            "could not create a stream");
+    return stream(c_stream);
+}
+
+/// Returns OpenCL queue object associated with the execution stream.
+///
+/// @param astream An execution stream.
+/// @returns Underlying OpenCL queue.
+inline cl_command_queue get_command_queue(const stream &astream) {
+    cl_command_queue queue = nullptr;
+    error::wrap_c_api(
+            dnnl_ocl_interop_stream_get_command_queue(astream.get(), &queue),
+            "could not get an OpenCL command queue from a stream");
+    return queue;
+}
+
+/// Returns the OpenCL memory object associated with the memory object.
+///
+/// @param amemory A memory object.
+/// @returns Underlying OpenCL memory object.
+inline cl_mem get_mem_object(const memory &amemory) {
+    cl_mem mem_object;
+    error::wrap_c_api(
+            dnnl_ocl_interop_memory_get_mem_object(amemory.get(), &mem_object),
+            "could not get OpenCL buffer object from a memory object");
+    return mem_object;
+}
+
+/// Sets the OpenCL memory object associated with the memory object.
+///
+/// For behavioral details see memory::set_data_handle().
+///
+/// @param amemory A memory object.
+/// @param mem_object OpenCL cl_mem object to use as the underlying
+///     storage. It must have at least get_desc().get_size() bytes
+///     allocated.
+inline void set_mem_object(memory &amemory, cl_mem mem_object) {
+    error::wrap_c_api(
+            dnnl_ocl_interop_memory_set_mem_object(amemory.get(), mem_object),
+            "could not set OpenCL buffer object from a memory object");
+}
+
+/// Returns the memory allocation kind associated with a memory object.
+///
+/// @param amemory A memory object.
+///
+/// @returns The underlying memory allocation kind of the memory object.
+inline memory_kind get_memory_kind(const memory &amemory) {
+    dnnl_ocl_interop_memory_kind_t ckind;
+    error::wrap_c_api(
+            dnnl_ocl_interop_memory_get_memory_kind(amemory.get(), &ckind),
+            "could not get memory kind");
+    return static_cast<memory_kind>(ckind);
+}
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handles.
+/// @param handles Handles of the memory buffers to use as underlying
+///     storages. For each element of the @p handles vector the following
+///     applies:
+///     - A USM pointer to the user-allocated buffer. In this case the
+///       library doesn't own the buffer. Requires @p kind to be equal to
+///       memory_kind::usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p kind to be equal to memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library owns the
+///       buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create a memory object without an underlying buffer.
+///
+/// If the @p handles vector is not provided, the library will allocate all
+/// buffers as if all handles had the special value DNNL_MEMORY_ALLOCATE.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind,
+        std::vector<void *> handles = {}) {
+    if (handles.empty()) {
+        const int nhandles = memory_desc.get_num_handles();
+        handles.resize(nhandles, DNNL_MEMORY_ALLOCATE);
+    }
+
+    dnnl_memory_t c_memory;
+    error::wrap_c_api(
+            dnnl_ocl_interop_memory_create_v2(&c_memory, memory_desc.get(),
+                    aengine.get(), convert_to_c(kind), (int)handles.size(),
+                    handles.data()),
+            "could not create a memory");
+    return memory(c_memory);
+}
+
+/// Constructs a memory object with multiple OpenCL buffers.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param mem_objects A vector of OpenCL buffers to use.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, std::vector<cl_mem> mem_objects) {
+    const int nhandles = memory_desc.get_num_handles();
+    std::vector<void *> handles(nhandles, DNNL_MEMORY_NONE);
+    memory amemory(memory_desc, aengine, handles);
+    for (int i = 0; i < nhandles; i++)
+        amemory.set_data_handle(mem_objects[i], i);
+    return amemory;
+}
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE,
+/// the constructed memory object will have the underlying buffer set. In
+/// this case, the buffer will be initialized as if:
+/// - dnnl::memory::set_data_handle() had been called, if @p kind is
+///   equal to dnnl::ocl_interop::memory_kind::usm, or
+/// - dnnl::ocl_interop::set_mem_object() has been called, if @p kind is
+///   equal to dnnl::ocl_interop::memory_kind::buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying
+///     storage.
+///     - A USM pointer to the user-allocated buffer. In this case the
+///       library doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::ocl_interop::memory_kind::usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p kind to be equal to
+///       dnnl::ocl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value.
+///       Instructs the library to allocate the buffer that corresponds to
+///       the memory allocation kind @p kind for the memory object. In this
+///       case the library owns the buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create a memory object without an underlying buffer.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind, void *handle) {
+    return make_memory(
+            memory_desc, aengine, kind, std::vector<void *> {handle});
+}
+
+/// Constructs a memory object from an OpenCL buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param mem_object An OpenCL buffer to use.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, cl_mem mem_object) {
+    return make_memory(
+            memory_desc, aengine, std::vector<cl_mem> {mem_object});
+}
+#else
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE,
+/// the constructed memory object will have the underlying buffer set. In
+/// this case, the buffer will be initialized as if:
+/// - dnnl::memory::set_data_handle() had been called, if @p kind is
+///   equal to dnnl::ocl_interop::memory_kind::usm, or
+/// - dnnl::ocl_interop::set_mem_object() has been called, if @p kind is
+///   equal to dnnl::ocl_interop::memory_kind::buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying
+///     storage.
+///     - A USM pointer to the user-allocated buffer. In this case the
+///       library doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::ocl_interop::memory_kind::usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p kind to be equal to
+///       dnnl::ocl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library owns the
+///       buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create a memory object without an underlying buffer.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind,
+        void *handle = DNNL_MEMORY_ALLOCATE) {
+    dnnl_memory_t c_memory;
+    error::wrap_c_api(
+            dnnl_ocl_interop_memory_create(&c_memory, memory_desc.get(),
+                    aengine.get(), convert_to_c(kind), handle),
+            "could not create a memory");
+    return memory(c_memory);
+}
+
+/// Constructs a memory object from an OpenCL buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param mem_object An OpenCL buffer to use.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, cl_mem mem_object) {
+    memory amemory(memory_desc, aengine, DNNL_MEMORY_NONE);
+    set_mem_object(amemory, mem_object);
+    return amemory;
+}
+#endif
+
+/// Executes computations specified by the primitive in a specified stream
+/// and returns an OpenCL event.
+///
+/// Arguments are passed via an arguments map containing
+/// <index, memory object> pairs.
+/// The index must be one of the `DNNL_ARG_*` values such as `DNNL_ARG_SRC`,
+/// and the memory must have a memory descriptor matching the one returned
+/// by #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless
+/// using dynamic shapes (see #DNNL_RUNTIME_DIM_VAL).
+///
+/// @param aprimitive Primitive to execute.
+/// @param astream Stream object. The stream must belong to the same engine
+///     as the primitive.
+/// @param args Arguments map.
+/// @param deps Optional vector with `cl_event` dependencies.
+///
+/// @returns Output event. It's the user's responsibility to manage the
+///     lifetime of the event.
+inline cl_event execute(const dnnl::primitive &aprimitive,
+        const stream &astream, const std::unordered_map<int, memory> &args,
+        const std::vector<cl_event> &deps = {}) {
+    std::vector<dnnl_exec_arg_t> c_args;
+    c_args.reserve(args.size());
+    for (const auto &a : args)
+        c_args.push_back({a.first, a.second.get()});
+
+    const cl_event *c_deps = deps.empty() ? nullptr : deps.data();
+
+    cl_event return_event;
+    error::wrap_c_api(dnnl_ocl_interop_primitive_execute(aprimitive.get(),
+                              astream.get(), (int)c_args.size(),
+                              c_args.data(), c_deps, (int)deps.size(),
+                              &return_event),
+            "could not execute a primitive");
+    return return_event;
+}
+
+} // namespace ocl_interop
+
+/// @} dnnl_api_ocl_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ocl_types.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ocl_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..168e1259dd8d60c23640bdf405334eda6482841a
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ocl_types.h
@@ -0,0 +1,51 @@
+/*******************************************************************************
+* Copyright 2021 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_OCL_TYPES_H
+#define ONEAPI_DNNL_DNNL_OCL_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_ocl_interop
+/// @{
+
+/// Memory allocation kind.
+typedef enum {
+    /// USM (device, shared, host, or unknown) memory allocation kind.
+    dnnl_ocl_interop_usm,
+    /// Buffer memory allocation kind - default.
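Review note: a short usage sketch of the ocl_interop::execute wrapper defined above, chaining a primitive after one prior event (`prim`, `strm`, `src`, `dst`, and `prior` are assumed to exist):

```cpp
std::unordered_map<int, dnnl::memory> args
        = {{DNNL_ARG_SRC, src}, {DNNL_ARG_DST, dst}};

// Execute with one dependency; release the returned event once waited on.
cl_event done = dnnl::ocl_interop::execute(prim, strm, args, {prior});
clWaitForEvents(1, &done);
clReleaseEvent(done);
```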
+    dnnl_ocl_interop_buffer,
+} dnnl_ocl_interop_memory_kind_t;
+
+/// @} dnnl_api_ocl_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.h
new file mode 100644
index 0000000000000000000000000000000000000000..09f7d632a8ab51ed5ee5a4898fac6bbaef309cab
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.h
@@ -0,0 +1,199 @@
+/*******************************************************************************
+* Copyright 2020-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_SYCL_H
+#define ONEAPI_DNNL_DNNL_SYCL_H
+
+#include "oneapi/dnnl/dnnl.h"
+
+#include "oneapi/dnnl/dnnl_sycl_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_sycl_interop
+/// @{
+
+/// Creates an engine associated with a SYCL device and a SYCL context.
+///
+/// @param engine Output engine.
+/// @param device Pointer to the SYCL device to use for the engine.
+/// @param context Pointer to the SYCL context to use for the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_engine_create(
+        dnnl_engine_t *engine, const void *device, const void *context);
+
+/// Returns the SYCL context associated with an engine.
+///
+/// @param engine Engine to query.
+/// @param context Pointer to the underlying SYCL context of the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_engine_get_context(
+        dnnl_engine_t engine, void **context);
+
+/// Returns the SYCL device associated with an engine.
+///
+/// @param engine Engine to query.
+/// @param device Pointer to the underlying SYCL device of the engine.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_engine_get_device(
+        dnnl_engine_t engine, void **device);
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE,
+/// the constructed memory object will have the underlying buffer set. In
+/// this case, the buffer will be initialized as if:
+/// - dnnl_memory_set_data_handle() had been called, if @p memory_kind is
+///   equal to dnnl_sycl_interop_usm, or
+/// - dnnl_sycl_interop_memory_set_buffer() has been called, if
+///   @p memory_kind is equal to dnnl_sycl_interop_buffer.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param memory_kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl_sycl_interop_usm.
+///     - A pointer to a SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p memory_kind to be equal to
+///       dnnl_sycl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create a memory object without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_memory_create(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        dnnl_sycl_interop_memory_kind_t memory_kind, void *handle);
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param memory_kind Memory allocation kind to specify the type of handles.
+/// @param nhandles Number of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     For each element of the @p handles array the following applies:
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl_sycl_interop_usm.
+///     - A pointer to a SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p memory_kind to be equal to
+///       dnnl_sycl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create a memory object without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
dnnl_status_t DNNL_API dnnl_sycl_interop_memory_create_v2(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        dnnl_sycl_interop_memory_kind_t memory_kind, int nhandles,
+        void **handles);
+#endif
+
+/// Returns the memory allocation kind associated with a memory object.
+///
+/// @param memory Memory to query.
+/// @param memory_kind Output underlying memory allocation kind of the memory
+///     object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_memory_get_memory_kind(
+        const_dnnl_memory_t memory,
+        dnnl_sycl_interop_memory_kind_t *memory_kind);
+
+/// Sets a SYCL buffer for a memory object.
+///
+/// @param memory Memory object.
+/// @param buffer SYCL buffer to be set in the memory object.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_memory_set_buffer(
+        dnnl_memory_t memory, void *buffer);
+
+/// Creates an execution stream for a given engine associated with a SYCL
+/// queue.
+///
+/// @param stream Output execution stream.
+/// @param engine Engine to create the execution stream on.
+/// @param queue SYCL queue to use.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_stream_create(
+        dnnl_stream_t *stream, dnnl_engine_t engine, void *queue);
+
+/// Returns the SYCL queue associated with an execution stream.
+///
+/// @param stream Execution stream to query.
+/// @param queue Output SYCL command queue.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_stream_get_queue(
+        dnnl_stream_t stream, void **queue);
+
+/// Executes computations specified by the primitive in a specified stream and
+/// returns a SYCL event.
+///
+/// @param primitive Primitive to execute.
+/// @param stream Stream to use.
+/// @param nargs Number of arguments.
+/// @param args Array of arguments. Each argument is an
+///     <index, #dnnl_memory_t> pair. The index is one of the `DNNL_ARG_*`
+///     values such as `DNNL_ARG_SRC`. Unless runtime shapes are used (see
+///     #DNNL_RUNTIME_DIM_VAL), the memory object must have the same memory
+///     descriptor as that returned by
+///     #dnnl_primitive_desc_query_md(#dnnl_query_exec_arg_md, index).
+/// @param deps A pointer to a std::vector<sycl::event> that contains
+///     dependencies.
+/// @param return_event Output event.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_primitive_execute(
+        const_dnnl_primitive_t primitive, dnnl_stream_t stream, int nargs,
+        const dnnl_exec_arg_t *args, const void *deps, void *return_event);
+
+/// @} dnnl_api_sycl_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.hpp b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c5844e9bbbdfe0c0e5900da89374c9842d24ffbf
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_sycl.hpp
@@ -0,0 +1,384 @@
+/*******************************************************************************
+* Copyright 2020-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_SYCL_HPP
+#define ONEAPI_DNNL_DNNL_SYCL_HPP
+
+/// @cond DO_NOT_DOCUMENT_THIS
+#include <algorithm>
+#include <cstdlib>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#error "Unsupported compiler"
+#endif
+
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_sycl.h"
+
+/// @endcond
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_sycl_interop SYCL interoperability API
+/// API extensions to interact with the underlying SYCL run-time.
+///
+/// @sa @ref dev_guide_dpcpp_interoperability in developer guide
+/// @{
+
+/// SYCL interoperability namespace
+namespace sycl_interop {
+
+/// Memory allocation kind.
+enum class memory_kind {
+    /// USM (device, shared, host, or unknown) memory allocation kind - default.
+    usm = dnnl_sycl_interop_usm,
+    /// Buffer memory allocation kind.
+    buffer = dnnl_sycl_interop_buffer,
+};
+
+/// Converts a memory allocation kind enum value from C++ API to C API type.
+///
+/// @param akind C++ API memory allocation kind enum value.
+/// @returns Corresponding C API memory allocation kind enum value.
+inline dnnl_sycl_interop_memory_kind_t convert_to_c(memory_kind akind) {
+    return static_cast<dnnl_sycl_interop_memory_kind_t>(akind);
+}
+
+/// Constructs an engine from SYCL device and context objects.
+///
+/// @param adevice SYCL device.
+/// @param acontext SYCL context.
+///
+/// @returns Created engine.
+inline engine make_engine(
+        const sycl::device &adevice, const sycl::context &acontext) {
+    dnnl_engine_t aengine;
+    error::wrap_c_api(dnnl_sycl_interop_engine_create(&aengine,
+                              static_cast<const void *>(&adevice),
+                              static_cast<const void *>(&acontext)),
+            "could not create an engine");
+    return engine(aengine);
+}
+
+/// Returns the SYCL context associated with an engine.
+///
+/// @param aengine Engine to query.
+///
+/// @returns The underlying SYCL context of the engine.
+inline sycl::context get_context(const engine &aengine) {
+    void *ctx_ptr;
+    error::wrap_c_api(
+            dnnl_sycl_interop_engine_get_context(aengine.get(), &ctx_ptr),
+            "could not get a context handle");
+    auto ctx = *static_cast<sycl::context *>(ctx_ptr);
+    return ctx;
+}
+
+/// Returns the SYCL device associated with an engine.
+///
+/// @param aengine Engine to query.
+///
+/// @returns The underlying SYCL device of the engine.
+inline sycl::device get_device(const engine &aengine) {
+    void *dev_ptr;
+    error::wrap_c_api(
+            dnnl_sycl_interop_engine_get_device(aengine.get(), &dev_ptr),
+            "could not get a device handle");
+    auto dev = *static_cast<sycl::device *>(dev_ptr);
+    return dev;
+}
+
+/// Creates an execution stream for a given engine associated with a SYCL
+/// queue.
+///
+/// @param aengine Engine object to use for the stream.
+/// @param aqueue SYCL queue to use for the stream.
+///
+/// @returns An execution stream.
+inline stream make_stream(const engine &aengine, sycl::queue &aqueue) {
+    dnnl_stream_t astream;
+    error::wrap_c_api(
+            dnnl_sycl_interop_stream_create(&astream, aengine.get(), &aqueue),
+            "could not create a stream");
+    return stream(astream);
+}
+
+/// Returns the SYCL queue associated with an execution stream.
+///
+/// @param astream Execution stream to query.
+///
+/// @returns SYCL queue object.
+inline sycl::queue get_queue(const stream &astream) {
+    void *queue_ptr;
+    error::wrap_c_api(
+            dnnl_sycl_interop_stream_get_queue(astream.get(), &queue_ptr),
+            "could not get a stream handle");
+    auto queue = *static_cast<sycl::queue *>(queue_ptr);
+    return queue;
+}
+
+/// Returns the SYCL buffer associated with a memory object.
+///
+/// Throws an exception if the memory allocation kind associated with the
+/// memory object is not equal to dnnl::sycl_interop::memory_kind::buffer.
+///
+/// @tparam T Type of the requested buffer.
+/// @tparam ndims Number of dimensions of the requested buffer.
+/// @param amemory Memory object.
+///
+/// @returns SYCL buffer associated with the memory object.
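+///
+/// A minimal usage sketch (assuming a memory object @c amemory that was
+/// created with dnnl::sycl_interop::memory_kind::buffer and holds f32 data):
+/// ~~~cpp
+/// // Reinterpret the underlying storage as a 1D float buffer and read it
+/// // back on the host.
+/// sycl::buffer<float, 1> buf = dnnl::sycl_interop::get_buffer<float>(amemory);
+/// sycl::host_accessor acc(buf, sycl::read_only);
+/// float first_value = acc[0];
+/// ~~~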
+template <typename T, int ndims = 1>
+sycl::buffer<T, ndims> get_buffer(const memory &amemory) {
+    static_assert(ndims == 1, "only 1D buffers supported");
+
+    // XXX: workaround: when CPU runtime is not SYCL and amemory was created
+    // for CPU engine `get_buffer` should return an error. Use interop API to
+    // implement the check.
+    dnnl_sycl_interop_memory_kind_t ckind;
+    error::wrap_c_api(
+            dnnl_sycl_interop_memory_get_memory_kind(amemory.get(), &ckind),
+            "could not get SYCL buffer object");
+
+    void *handle_ptr;
+    error::wrap_c_api(dnnl_memory_get_data_handle(amemory.get(), &handle_ptr),
+            "could not get SYCL buffer object");
+
+    // XXX: workaround: zero-range buffer cannot be constructed.
+    if (!handle_ptr) return sycl::buffer<T, ndims>(sycl::range<1>(1));
+
+    // The library stores the handle as a 1D uint8_t buffer; reinterpret it
+    // into the requested element type.
+    auto &buf_u8 = *static_cast<sycl::buffer<uint8_t, 1> *>(handle_ptr);
+
+    auto range = sycl::range<1>(buf_u8.byte_size() / sizeof(T));
+    return buf_u8.reinterpret<T, ndims>(range);
+}
+
+/// Sets the SYCL buffer associated with a memory object.
+///
+/// @tparam T Type of the buffer.
+/// @tparam ndims Number of dimensions of the buffer.
+/// @param amemory Memory object to change.
+/// @param abuffer SYCL buffer.
+template <typename T, int ndims>
+void set_buffer(memory &amemory, sycl::buffer<T, ndims> &abuffer) {
+    auto range = sycl::range<1>(abuffer.byte_size());
+    auto buf_u8 = abuffer.template reinterpret<uint8_t, 1>(range);
+    error::wrap_c_api(dnnl_sycl_interop_memory_set_buffer(
+                              amemory.get(), static_cast<void *>(&buf_u8)),
+            "could not set SYCL buffer object");
+}
+
+/// Returns the memory allocation kind associated with a memory object.
+///
+/// @param amemory A memory object.
+///
+/// @returns The underlying memory allocation kind of the memory object.
+inline memory_kind get_memory_kind(const memory &amemory) {
+    dnnl_sycl_interop_memory_kind_t ckind;
+    error::wrap_c_api(
+            dnnl_sycl_interop_memory_get_memory_kind(amemory.get(), &ckind),
+            "could not get memory kind");
+    return static_cast<memory_kind>(ckind);
+}
+
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     For each element of the @p handles array the following applies:
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::usm.
+///     - A pointer to a SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library owns the
+///       buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create a memory object without an underlying buffer.
+///
+/// If the @p handles vector is not provided, the library will allocate all
+/// buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE.
+///
+/// @returns Created memory object.
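+///
+/// A brief usage sketch (assuming an engine @c eng created for the SYCL
+/// runtime and a memory descriptor @c md; the names are illustrative):
+/// ~~~cpp
+/// // Omit the handles vector so the library allocates every underlying
+/// // buffer itself (equivalent to passing DNNL_MEMORY_ALLOCATE handles).
+/// dnnl::memory mem = dnnl::sycl_interop::make_memory(
+///         md, eng, dnnl::sycl_interop::memory_kind::usm);
+/// ~~~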
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind,
+        std::vector<void *> handles = {}) {
+    if (handles.empty()) {
+        // Default to library-owned allocations for every handle.
+        const int nhandles = memory_desc.get_num_handles();
+        handles.resize(nhandles, DNNL_MEMORY_ALLOCATE);
+    }
+
+    dnnl_memory_t c_memory;
+    error::wrap_c_api(
+            dnnl_sycl_interop_memory_create_v2(&c_memory, memory_desc.get(),
+                    aengine.get(), convert_to_c(kind), (int)handles.size(),
+                    handles.data()),
+            "could not create a memory");
+    return memory(c_memory);
+}
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
+/// constructed memory object will have the underlying buffer set. In this
+/// case, the buffer will be initialized as if:
+/// - dnnl::memory::set_data_handle() had been called, if @p kind is
+///   equal to dnnl::sycl_interop::memory_kind::usm, or
+/// - dnnl::sycl_interop::set_buffer() had been called, if @p kind is
+///   equal to dnnl::sycl_interop::memory_kind::buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::usm.
+///     - A pointer to a SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library owns the
+///       buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create a memory object without an underlying buffer.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind, void *handle) {
+    return make_memory(
+            memory_desc, aengine, kind, std::vector<void *> {handle});
+}
+#else
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
+/// constructed memory object will have the underlying buffer set. In this
+/// case, the buffer will be initialized as if:
+/// - dnnl::memory::set_data_handle() had been called, if @p kind is
+///   equal to dnnl::sycl_interop::memory_kind::usm, or
+/// - dnnl::sycl_interop::set_buffer() had been called, if @p kind is
+///   equal to dnnl::sycl_interop::memory_kind::buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::usm.
+///     - A pointer to a SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p kind to be equal to
+///       dnnl::sycl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p kind for the memory object. In this case the library owns the
+///       buffer.
+///     - The DNNL_MEMORY_NONE special value. Instructs the library to
+///       create a memory object without an underlying buffer.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind,
+        void *handle = DNNL_MEMORY_ALLOCATE) {
+    dnnl_memory_t c_memory;
+    error::wrap_c_api(
+            dnnl_sycl_interop_memory_create(&c_memory, memory_desc.get(),
+                    aengine.get(), convert_to_c(kind), handle),
+            "could not create a memory");
+    return memory(c_memory);
+}
+#endif
+
+/// Constructs a memory object from a SYCL buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param abuffer A SYCL buffer to use.
+///
+/// @returns Created memory object.
+template <typename T, int ndims>
+memory make_memory(const memory::desc &memory_desc, const engine &aengine,
+        sycl::buffer<T, ndims> &abuffer) {
+    memory amemory(memory_desc, aengine, DNNL_MEMORY_NONE);
+    set_buffer(amemory, abuffer);
+    return amemory;
+}
+
+/// Executes computations specified by the primitive in a specified stream and
+/// returns a SYCL event.
+///
+/// Arguments are passed via an arguments map containing
+/// <index, memory object> pairs. The index must be one of the `DNNL_ARG_*`
+/// values such as `DNNL_ARG_SRC`, and the memory must have a memory descriptor
+/// matching the one returned by
+/// #dnnl::primitive_desc::query_md(#query::exec_arg_md, index) unless using
+/// dynamic shapes (see #DNNL_RUNTIME_DIM_VAL).
+///
+/// @param aprimitive Primitive to execute.
+/// @param astream Stream object. The stream must belong to the same engine
+///     as the primitive.
+/// @param args Arguments map.
+/// @param deps Optional vector with `sycl::event` dependencies.
+///
+/// @returns Output event.
+inline sycl::event execute(const dnnl::primitive &aprimitive,
+        const stream &astream, const std::unordered_map<int, memory> &args,
+        const std::vector<sycl::event> &deps = {}) {
+    // Convert the C++ argument map into the flat array the C API expects.
+    std::vector<dnnl_exec_arg_t> c_args;
+    c_args.reserve(args.size());
+    for (const auto &a : args)
+        c_args.push_back({a.first, a.second.get()});
+
+    sycl::event return_event;
+    error::wrap_c_api(
+            dnnl_sycl_interop_primitive_execute(aprimitive.get(), astream.get(),
+                    (int)c_args.size(), c_args.data(), &deps, &return_event),
+            "could not execute a primitive");
+    return return_event;
+}
+
+} // namespace sycl_interop
+
+/// @} dnnl_api_sycl_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif // ONEAPI_DNNL_DNNL_SYCL_HPP
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_sycl_types.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_sycl_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..979d4cddaad34d3980ceb07ad785bc3f00d49cf6
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_sycl_types.h
@@ -0,0 +1,51 @@
+/*******************************************************************************
+* Copyright 2020-2021 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_SYCL_TYPES_H
+#define ONEAPI_DNNL_DNNL_SYCL_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_sycl_interop
+/// @{
+
+/// Memory allocation kind.
+typedef enum {
+    /// USM (device, shared, host, or unknown) memory allocation kind - default.
+    dnnl_sycl_interop_usm,
+    /// Buffer memory allocation kind.
+    dnnl_sycl_interop_buffer,
+} dnnl_sycl_interop_memory_kind_t;
+
+/// @} dnnl_api_sycl_interop
+
+/// @} dnnl_api_interop
+
+/// @} dnnl_api
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6a931b702e3b8cba222782d3f5d656074986243
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.h
@@ -0,0 +1,118 @@
+/*******************************************************************************
+* Copyright 2020-2022 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_H
+#define ONEAPI_DNNL_DNNL_THREADPOOL_H
+
+#include "oneapi/dnnl/dnnl_config.h"
+#include "oneapi/dnnl/dnnl_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_threadpool_interop
+/// @{
+
+/// Creates an execution stream with specified threadpool.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param stream Output execution stream.
+/// @param engine Engine to create the execution stream on.
+/// @param threadpool Pointer to an instance of a C++ class that implements
+///     the dnnl::threadpool_interop::threadpool_iface interface.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_threadpool_interop_stream_create(
+        dnnl_stream_t *stream, dnnl_engine_t engine, void *threadpool);
+
+/// Returns a threadpool to be used by the execution stream.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param astream Execution stream.
+/// @param threadpool Output pointer to an instance of a C++ class that
+///     implements the dnnl::threadpool_interop::threadpool_iface interface.
+///     Set to NULL if the stream was created without threadpool.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_threadpool_interop_stream_get_threadpool(
+        dnnl_stream_t astream, void **threadpool);
+
+/// Sets the maximum concurrency assumed by oneDNN when outside a
+/// parallel call.
+///
+/// @param max_concurrency The maximum concurrency assumed by oneDNN
+///     when outside a parallel call. This is a thread-local setting.
+/// @returns #dnnl_success on success and a status describing the +/// error otherwise. +dnnl_status_t DNNL_API dnnl_threadpool_interop_set_max_concurrency( + int max_concurrency); + +/// Gets the maximum concurrency assumed by oneDNN when outside a +/// parallel call. +/// +/// @param max_concurrency The maximum concurrency assumed by oneDNN +/// when outside a parallel call. This is a threadlocal setting. +/// @returns #dnnl_success on success and a status describing the +/// error otherwise. +dnnl_status_t DNNL_API dnnl_threadpool_interop_get_max_concurrency( + int *max_concurrency); + +/// @copydoc dnnl_sgemm() +/// @param threadpool A pointer to a threadpool interface (only when built with +/// the THREADPOOL CPU runtime). +dnnl_status_t DNNL_API dnnl_threadpool_interop_sgemm(char transa, char transb, + dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, float alpha, const float *A, + dnnl_dim_t lda, const float *B, dnnl_dim_t ldb, float beta, float *C, + dnnl_dim_t ldc, void *threadpool); + +/// @copydoc dnnl_gemm_u8s8s32() +/// @param threadpool A pointer to a threadpool interface (only when built with +/// the THREADPOOL CPU runtime). +dnnl_status_t DNNL_API dnnl_threadpool_interop_gemm_u8s8s32(char transa, + char transb, char offsetc, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, + float alpha, const uint8_t *A, dnnl_dim_t lda, uint8_t ao, + const int8_t *B, dnnl_dim_t ldb, int8_t bo, float beta, int32_t *C, + dnnl_dim_t ldc, const int32_t *co, void *threadpool); + +/// @copydoc dnnl_gemm_s8s8s32() +/// @param threadpool A pointer to a threadpool interface (only when built with +/// the THREADPOOL CPU runtime). +dnnl_status_t DNNL_API dnnl_threadpool_interop_gemm_s8s8s32(char transa, + char transb, char offsetc, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, + float alpha, const int8_t *A, dnnl_dim_t lda, int8_t ao, + const int8_t *B, dnnl_dim_t ldb, int8_t bo, float beta, int32_t *C, + dnnl_dim_t ldc, const int32_t *co, void *threadpool); + +/// @} dnnl_api_threadpool_interop + +/// @} dnnl_api_interop + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.hpp b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9d9e804e14080f3e70f0c4364b298eaff97aedae --- /dev/null +++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool.hpp @@ -0,0 +1,113 @@ +/******************************************************************************* +* Copyright 2020-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_HPP
+#define ONEAPI_DNNL_DNNL_THREADPOOL_HPP
+
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_threadpool.h"
+
+#include "oneapi/dnnl/dnnl_threadpool_iface.hpp"
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_threadpool_interop Threadpool interoperability API
+/// API extensions to interact with the underlying Threadpool run-time.
+/// @{
+
+/// Threadpool interoperability namespace
+namespace threadpool_interop {
+
+/// Constructs an execution stream for the specified engine and threadpool.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param aengine Engine to create the stream on.
+/// @param threadpool Pointer to an instance of a C++ class that implements
+///     the dnnl::threadpool_interop::threadpool_iface interface.
+/// @returns An execution stream.
+inline dnnl::stream make_stream(
+        const dnnl::engine &aengine, threadpool_iface *threadpool) {
+    dnnl_stream_t c_stream;
+    dnnl::error::wrap_c_api(dnnl_threadpool_interop_stream_create(
+                                    &c_stream, aengine.get(), threadpool),
+            "could not create stream");
+    return dnnl::stream(c_stream);
+}
+
+/// Returns the pointer to a threadpool that is used by an execution stream.
+///
+/// @sa @ref dev_guide_threadpool
+///
+/// @param astream An execution stream.
+/// @returns Output pointer to an instance of a C++ class that implements
+///     the dnnl::threadpool_interop::threadpool_iface interface, or NULL if
+///     the stream was created without threadpool.
+inline threadpool_iface *get_threadpool(const dnnl::stream &astream) {
+    void *tp;
+    dnnl::error::wrap_c_api(
+            dnnl_threadpool_interop_stream_get_threadpool(astream.get(), &tp),
+            "could not get stream threadpool");
+    return static_cast<threadpool_iface *>(tp);
+}
+
+/// @copydoc dnnl_threadpool_interop_sgemm()
+inline status sgemm(char transa, char transb, dnnl_dim_t M, dnnl_dim_t N,
+        dnnl_dim_t K, float alpha, const float *A, dnnl_dim_t lda,
+        const float *B, dnnl_dim_t ldb, float beta, float *C, dnnl_dim_t ldc,
+        threadpool_iface *threadpool) {
+    return static_cast<status>(dnnl_threadpool_interop_sgemm(transa, transb, M,
+            N, K, alpha, A, lda, B, ldb, beta, C, ldc, threadpool));
+}
+
+/// @copydoc dnnl_threadpool_interop_gemm_u8s8s32()
+inline status gemm_u8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, float alpha, const uint8_t *A,
+        dnnl_dim_t lda, uint8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo,
+        float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co,
+        threadpool_iface *threadpool) {
+    return static_cast<status>(dnnl_threadpool_interop_gemm_u8s8s32(transa,
+            transb, offsetc, M, N, K, alpha, A, lda, ao, B, ldb, bo, beta, C,
+            ldc, co, threadpool));
+}
+
+/// @copydoc dnnl_threadpool_interop_gemm_s8s8s32()
+inline status gemm_s8s8s32(char transa, char transb, char offsetc, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, float alpha, const int8_t *A,
+        dnnl_dim_t lda, int8_t ao, const int8_t *B, dnnl_dim_t ldb, int8_t bo,
+        float beta, int32_t *C, dnnl_dim_t ldc, const int32_t *co,
+        threadpool_iface *threadpool) {
+    return static_cast<status>(dnnl_threadpool_interop_gemm_s8s8s32(transa,
+            transb, offsetc, M, N, K, alpha, A, lda, ao, B, ldb, bo, beta, C,
+            ldc, co, threadpool));
+}
+
+} // namespace threadpool_interop
+
+/// @} dnnl_api_threadpool_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool_iface.hpp b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool_iface.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..192952e9e55f48614fe1b6709c7f3383e6e53c55
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_threadpool_iface.hpp
@@ -0,0 +1,73 @@
+/*******************************************************************************
+* Copyright 2020-2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP
+#define ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP
+
+#include <cstdint>
+#include <functional>
+
+/// @addtogroup dnnl_api
+/// @{
+
+namespace dnnl {
+
+/// @addtogroup dnnl_api_interop
+/// @{
+
+/// @addtogroup dnnl_api_threadpool_interop
+/// @{
+
+namespace threadpool_interop {
+
+/// Abstract threadpool interface. The users are expected to subclass this
+/// interface and pass an object to the library during CPU stream creation or
+/// directly in case of BLAS functions.
+struct threadpool_iface {
+    /// Returns the number of worker threads.
+    virtual int get_num_threads() const = 0;
+
+    /// Returns true if the calling thread belongs to this threadpool.
+    virtual bool get_in_parallel() const = 0;
+
+    /// Submits n instances of a closure for execution in parallel:
+    ///
+    /// for (int i = 0; i < n; i++) fn(i, n);
+    ///
+    virtual void parallel_for(int n, const std::function<void(int, int)> &fn)
+            = 0;
+
+    /// Returns threadpool behavior flags bit mask (see below).
+    virtual uint64_t get_flags() const = 0;
+
+    /// If set, parallel_for() returns immediately, and oneDNN must itself
+    /// implement waiting for the submitted closures to finish execution.
+    static constexpr uint64_t ASYNCHRONOUS = 1;
+
+    virtual ~threadpool_iface() {}
+};
+
+} // namespace threadpool_interop
+
+/// @} dnnl_api_threadpool_interop
+
+/// @} dnnl_api_interop
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_types.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f9507e95a9924577f767a474410edb158c3131d
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_types.h
@@ -0,0 +1,2936 @@
+/*******************************************************************************
+* Copyright 2016-2025 Intel Corporation
+* Copyright 2024 FUJITSU LIMITED
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// C API types definitions + +#ifndef ONEAPI_DNNL_DNNL_TYPES_H +#define ONEAPI_DNNL_DNNL_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +/// @cond DO_NOT_DOCUMENT_THIS +#include +#include +/// @endcond + +#include "oneapi/dnnl/dnnl_config.h" + +#include "oneapi/dnnl/dnnl_common_types.h" + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_memory +/// @{ + +/// Memory format kind +typedef enum { + /// Undefined memory format kind, used for empty memory descriptors. + dnnl_format_kind_undef = 0, + /// A special format kind that indicates that the actual format will be + /// selected by a primitive automatically. + dnnl_format_kind_any, + /// A tensor in a generic format described by the stride and blocking + /// values in each dimension. + dnnl_blocked, + /// A special format kind that indicates that tensor format is opaque. + dnnl_format_kind_opaque, +#ifdef DNNL_EXPERIMENTAL_SPARSE + /// Format kind for sparse tensors. + dnnl_format_kind_sparse, +#endif + /// Parameter to allow internal only format kinds without undefined + /// behavior. This parameter is chosen to be valid for so long as + /// sizeof(int) >= 2. + dnnl_format_kind_max = 0x7fff, +} dnnl_format_kind_t; + +#ifdef DNNL_EXPERIMENTAL_SPARSE +/// Sparse encodings. +typedef enum { + /// Undefined sparse encoding kind, used for empty memory descriptors. + dnnl_sparse_encoding_undef = 0, + /// Compressed Sparse Row (CSR) encoding. + dnnl_csr, + /// An encoding that is used for an opaque storage schema for + /// tensors with unstructured sparsity. A memory descriptor with the + /// packed encoding cannot be used to create a memory object. It can + /// only be used to create a primitive descriptor to query the + /// actual memory descriptor (similar to the format tag `any`). + dnnl_packed, + /// Coordinate Sparse Encoding (COO). + dnnl_coo, +} dnnl_sparse_encoding_t; +#endif + +#ifdef DNNL_EXPERIMENTAL_PROFILING +/// Profiling data kind. +typedef enum { + /// Undefined profiling data kind. + dnnl_profiling_data_kind_undef = 0, + /// Data kind to query an execution time in nanoseconds. + dnnl_profiling_data_kind_time, +} dnnl_profiling_data_kind_t; + +#endif + +/// Memory format tag specification. +/// +/// oneDNN formats describe physical data layout. The physical layout +/// is described as a sequence of the dimensions as they are laid out in the +/// memory (from the outer-most to the inner-most). Note that this order +/// doesn't affect the logical order of the dimensions that is kept in the +/// `dims` field of the dnnl_memory_desc_t structure. The logical order of the +/// dimensions is specified by the primitive that uses the tensor. +/// +/// For example, CNN 5D tensor always has its logical dimensions in the order +/// `(batch, channels, depth, height, width)`, while the physical layout might be +/// `NCDHW` (corresponds to #dnnl_ncdhw format tag) or +/// `NDHWC` (corresponds to #dnnl_ndhwc format tag). 
+///
+/// ~~~cpp
+/// int batch = 2, channels = 16, depth = 13, height = 13, width = 13;
+///
+/// int ndims = 5; // 5D tensor
+/// dnnl_dims_t dims = {batch, channels, depth, height, width};
+/// dnnl_memory_desc_t data_in_ncdhw;
+/// dnnl_memory_desc_create_with_tag(
+///         &data_in_ncdhw, 5, dims, dnnl_f32, dnnl_ncdhw);
+///
+/// // note that in both cases dims passed are the same
+/// dnnl_memory_desc_t data_in_ndhwc;
+/// dnnl_memory_desc_create_with_tag(
+///         &data_in_ndhwc, 5, dims, dnnl_f32, dnnl_ndhwc);
+///
+/// dnnl_memory_desc_destroy(data_in_ncdhw);
+/// dnnl_memory_desc_destroy(data_in_ndhwc);
+/// ~~~
+///
+/// Memory format tags can be further divided into two categories:
+/// - Domain-agnostic names, i.e. names that do not depend on the tensor usage
+///   in the specific primitive. These names use letters from `a` to `l` to
+///   denote logical dimensions from 1 to 12, and form the order in which the
+///   dimensions are laid out in memory. For instance, #dnnl_ab is used to
+///   denote a 2D tensor where the second logical dimension (aka `b`) is the
+///   innermost, i.e. has stride = 1, and the first logical dimension (`a`) is
+///   laid out in memory with a stride equal to the size of the second
+///   dimension. On the other hand, #dnnl_ba is just a transposed version of
+///   the same tensor: the first dimension (`a`) becomes the innermost one.
+/// - Domain-specific names, i.e. names that make sense only in the context of
+///   a certain domain, such as CNN. These names are just aliases to the
+///   corresponding domain-agnostic tags and are used mostly for convenience.
+///   For example, #dnnl_nc is used to denote a 2D CNN activations tensor
+///   memory format, where channels are the innermost dimension and batch is
+///   the outermost one. Moreover, #dnnl_nc is just an alias to #dnnl_ab,
+///   since for oneDNN CNN primitives the logical dimensions of
+///   activations tensors come in order: batch, channels, spatial.
+///   In other words, batch corresponds to the first logical dimension (`a`),
+///   and channels correspond to the second one (`b`).
+///
+/// The following domain-specific notation applies to memory format tags:
+/// - @c 'n' denotes the mini-batch dimension
+/// - @c 'c' denotes a channels dimension
+/// - When there are multiple channel dimensions (for example, in convolution
+///   weights tensor), @c 'i' and @c 'o' denote dimensions of input and output
+///   channels
+/// - @c 'd', @c 'h', and @c 'w' denote spatial depth, height, and width
+///   respectively
+///
+/// Upper-case letters indicate that the data is laid out in blocks for a
+/// particular dimension. In such cases, the format name contains both upper-
+/// and lower-case letters for that dimension with a lower-case letter preceded
+/// by the block size. For example: #dnnl_nChw8c describes a format where the
+/// outermost dimension is mini-batch, followed by the channel block number,
+/// followed by the spatial height and width, and finally followed by 8-element
+/// channel blocks.
+///
+/// @sa @ref dev_guide_understanding_memory_formats
+typedef enum {
+    /// Undefined memory format tag
+    dnnl_format_tag_undef = 0,
+    /// Undefined memory format tag.
+    /// The primitive selects a format automatically.
+    dnnl_format_tag_any,
+
+    // Semantic agnostic section
+    // The physical order of dimensions is defined by the permutation of the
+    // characters, assuming that ab..z defines the natural order.
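+    //
+    // As an illustrative sketch (the values here are assumed for the
+    // example): for a dense 2D tensor with dims = {M, N}, the dnnl_ab tag
+    // corresponds to strides {N, 1} (row-major, `b` innermost), while
+    // dnnl_ba corresponds to strides {1, M} (column-major, `a` innermost).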
+ + // Plain formats + + dnnl_a, ///< plain 1D tensor + dnnl_ab, ///< plain 2D tensor + dnnl_abc, ///< plain 3D tensor + dnnl_abcd, ///< plain 4D tensor + dnnl_abcde, ///< plain 5D tensor + dnnl_abcdef, ///< plain 6D tensor + dnnl_abcdefg, ///< plain 7D tensor + dnnl_abcdefgh, ///< plain 8D tensor + dnnl_abcdefghi, ///< plain 9D tensor + dnnl_abcdefghij, ///< plain 10D tensor + dnnl_abcdefghijk, ///< plain 11D tensor + dnnl_abcdefghijkl, ///< plain 12D tensor + + // Permuted plain formats + + dnnl_ba, ///< permuted 2D tensor + dnnl_acb, ///< permuted 3D tensor + dnnl_bac, ///< permuted 3D tensor + dnnl_bca, ///< permuted 3D tensor + dnnl_cab, ///< permuted 3D tensor + dnnl_cba, ///< permuted 3D tensor + dnnl_abdc, ///< permuted 4D tensor + dnnl_acbd, ///< permuted 4D tensor + dnnl_acdb, ///< permuted 4D tensor + dnnl_adbc, ///< permuted 4D tensor + dnnl_adcb, ///< permuted 4D tensor + dnnl_bacd, ///< permuted 4D tensor + dnnl_bcda, ///< permuted 4D tensor + dnnl_cdab, ///< permuted 4D tensor + dnnl_cdba, ///< permuted 4D tensor + dnnl_dcab, ///< permuted 4D tensor + dnnl_abced, ///< permuted 5D tensor + dnnl_abdec, ///< permuted 5D tensor + dnnl_acbde, ///< permuted 5D tensor + dnnl_acdeb, ///< permuted 5D tensor + dnnl_adecb, ///< permuted 5D tensor + dnnl_bacde, ///< permuted 5D tensor + dnnl_bcdea, ///< permuted 5D tensor + dnnl_cdeab, ///< permuted 5D tensor + dnnl_cdeba, ///< permuted 5D tensor + dnnl_decab, ///< permuted 5D tensor + dnnl_abcdfe, ///< permuted 6D tensor + dnnl_abdefc, ///< permuted 6D tensor + dnnl_abdfce, ///< permuted 6D tensor + dnnl_acbdef, ///< permuted 6D tensor + dnnl_adefcb, ///< permuted 6D tensor + dnnl_defcab, ///< permuted 6D tensor + dnnl_abcdegf, ///< permuted 7D tensor + dnnl_abcdefhg, ///< permuted 8D tensor + dnnl_abcdefgih, ///< permuted 9D tensor + dnnl_abcdefghji, ///< permuted 10D tensor + dnnl_abcdefghikj, ///< permuted 11D tensor + dnnl_abcdefghijlk, ///< permuted 12D tensor + + // Opaque blocked formats + + dnnl_Abc16a, + dnnl_ABc16a16b, + dnnl_ABc32a32b, + dnnl_ABc4a4b, + /// 3D tensor blocked by 2nd dimension with block size 16 + dnnl_aBc16b, + dnnl_ABc16b16a, + dnnl_Abc4a, + /// 3D tensor blocked by 2nd dimension with block size 32 + dnnl_aBc32b, + /// 3D tensor blocked by 2nd dimension with block size 4 + dnnl_aBc4b, + dnnl_ABc4b16a4b, + dnnl_ABc2b8a4b, + dnnl_ABc16b16a4b, + dnnl_ABc16b16a2b, + dnnl_ABc4b4a, + dnnl_ABc8a16b2a, + dnnl_ABc8a8b, + dnnl_ABc8a4b, + /// 3D tensor blocked by 2nd dimension with block size 8 + dnnl_aBc8b, + dnnl_ABc8b16a2b, + dnnl_BAc8a16b2a, + dnnl_ABc8b8a, + dnnl_Abcd16a, + dnnl_Abcd8a, + dnnl_ABcd16a16b, + dnnl_Abcd32a, + dnnl_ABcd32a32b, + /// 4D tensor blocked by 2nd dimension with block size 16 + dnnl_aBcd16b, + dnnl_ABcd16b16a, + dnnl_aBCd16b16c, + dnnl_aBCd16c16b, + dnnl_Abcd4a, + /// 4D tensor blocked by 2nd dimension with block size 32 + dnnl_aBcd32b, + /// 4D tensor blocked by 2nd dimension with block size 4 + dnnl_aBcd4b, + dnnl_ABcd4b16a4b, + dnnl_ABcd16b16a4b, + dnnl_ABcd16b16a2b, + dnnl_ABcd4b4a, + dnnl_ABcd4a4b, + dnnl_aBCd2c4b2c, + dnnl_aBCd4b8c2b, + dnnl_aBCd4c16b4c, + dnnl_aBCd2c8b4c, + dnnl_aBCd16c16b4c, + dnnl_aBCd16c16b2c, + dnnl_aBCd4c4b, + dnnl_aBCd4b4c, + dnnl_ABcd8a16b2a, + dnnl_ABcd2b8a4b, + dnnl_ABcd8a8b, + dnnl_ABcd8a4b, + /// 4D tensor blocked by 2nd dimension with block size 8 + dnnl_aBcd8b, + dnnl_aBCd4c8b2c, + dnnl_ABcd8b16a2b, + dnnl_aBCd8b16c2b, + dnnl_BAcd8a16b2a, + /// 4D tensor blocked by 1st and 2nd dimension with block size 8 + dnnl_ABcd8b8a, + dnnl_aBCd8b8c, + dnnl_aBCd8b4c, 
+ dnnl_aBCd8c16b2c, + dnnl_ABcde8a16b2a, + dnnl_aCBd8b16c2b, + dnnl_aBCd8c8b, + dnnl_Abcde16a, + dnnl_Abcde32a, + dnnl_ABcde16a16b, + dnnl_BAcde8a16b2a, + /// 4D tensor blocked by 3rd dimension with block size 4 + dnnl_aBCd2b4c2b, + /// 5D tensor blocked by 1st dimension with block size 16 + dnnl_ABcde4b16a4b, + /// 5D tensor blocked by 1st dimension with block size 8 + dnnl_ABcde2b8a4b, + /// 5D tensor blocked by 2nd dimension with block size 16 + dnnl_aBcde16b, + dnnl_ABcde16b16a, + dnnl_aBCde16b16c, + dnnl_aBCde16c16b, + dnnl_aBCde2c8b4c, + dnnl_Abcde4a, + /// 5D tensor blocked by 2nd dimension with block size 32 + dnnl_aBcde32b, + /// 5D tensor blocked by 2nd dimension with block size 4 + dnnl_aBcde4b, + dnnl_ABcde4b4a, + dnnl_ABcde4a4b, + dnnl_aBCde4b4c, + dnnl_aBCde2c4b2c, + dnnl_aBCde4b8c2b, + dnnl_aBCde4c16b4c, + dnnl_aBCde16c16b4c, + dnnl_aBCde16c16b2c, + dnnl_aBCde4c4b, + dnnl_Abcde8a, + dnnl_ABcde8a8b, + dnnl_ABcde8a4b, + dnnl_BAcde16b16a, + /// 5D tensor blocked by 2nd dimension with block size 8 + dnnl_aBcde8b, + dnnl_ABcde8b16a2b, + dnnl_aBCde8b16c2b, + dnnl_aBCde4c8b2c, + dnnl_aCBde8b16c2b, + dnnl_ABcde8b8a, + dnnl_ABcde32a32b, + dnnl_aBCde8b8c, + dnnl_aBCde8b4c, + dnnl_ABc4a8b8a4b, + dnnl_ABcd4a8b8a4b, + dnnl_ABcde4a8b8a4b, + dnnl_BAc4b8a8b4a, + dnnl_BAcd4b8a8b4a, + dnnl_BAcde4b8a8b4a, + dnnl_ABcd2a8b8a2b, + dnnl_aBCd4b8c8b4c, + dnnl_aBCde4b8c8b4c, + dnnl_aBCde2b8c8b2c, + dnnl_aBCde8c16b2c, + dnnl_aBCde8c8b, + /// 5D tensor blocked by 3rd dimension with block size 4 + dnnl_aBCde2b4c2b, + /// 6D tensor blocked by 2nd dimension with block size 16 + dnnl_aBcdef16b, + dnnl_aBCdef16b16c, + dnnl_aBCdef16c16b, + dnnl_aBCdef4c16b4c, + /// 6D tensor blocked by 2nd dimension with block size 8 + dnnl_aBCdef2c8b4c, + dnnl_aBCdef4c8b2c, + /// 6D tensor blocked by 3rd dimension with block size 4 + dnnl_aBCdef2b4c2b, + /// 6D tensor blocked by 2nd dimension with block size 4 + dnnl_aBcdef4b, + dnnl_aBCdef4c4b, + dnnl_aBCdef4b4c, + dnnl_aBCdef2c4b2c, + dnnl_aBCdef4b8c2b, + dnnl_aBCdef8b8c, + dnnl_aBCdef8b4c, + dnnl_aBCdef8c16b2c, + dnnl_aBCdef4b8c8b4c, + dnnl_aBCdef8b16c2b, + dnnl_aCBdef8b16c2b, + dnnl_aBCdef8c8b, + dnnl_aBdc16b, + dnnl_aBdC16b2c, + dnnl_aBdC16b4c, + dnnl_aBdc4b, + dnnl_aBdc8b, + dnnl_aBdec16b, + dnnl_aBdeC16b2c, + dnnl_aBdeC16b4c, + dnnl_aBdec32b, + dnnl_aBdec4b, + dnnl_aBdec8b, + dnnl_aBdefc16b, + dnnl_aBdefC16b2c, + dnnl_aCBdef16c16b, + dnnl_aBdefc4b, + dnnl_aBdefc8b, + dnnl_Abcdef16a, + dnnl_Abcdef32a, + dnnl_aBedc16b, + dnnl_Acb16a, + dnnl_AcB16a2b, + dnnl_AcB16a4b, + dnnl_Acb4a, + dnnl_Acb8a, + dnnl_aCBd16b16c, + dnnl_aCBd16c16b, + dnnl_aCBde16b16c, + dnnl_aCBde16c16b, + dnnl_Acdb16a, + dnnl_AcdB16a2b, + dnnl_AcdB16a4b, + dnnl_Acdb32a, + dnnl_Acdb4a, + dnnl_Acdb8a, + dnnl_Acdeb16a, + dnnl_AcdeB16a2b, + dnnl_Acdeb4a, + dnnl_Acdeb8a, + dnnl_Adcb16a, + dnnl_BAc16a16b, + dnnl_BAc16b16a, + dnnl_BAcd16a16b, + dnnl_BAcd16b16a, + dnnl_aCBd4c8b8c4b, + dnnl_aCBde4c8b8c4b, + dnnl_aCBdef4c8b8c4b, + dnnl_BAcde16a16b, + dnnl_aCBdef16b16c, + dnnl_ABc16b32a, + dnnl_ABc16b64a, + dnnl_ABc4b32a4b, + dnnl_ABc4b64a4b, + dnnl_ABc8b32a2b, + dnnl_ABc8b64a2b, + dnnl_AB16b16a, + dnnl_AB16b32a, + dnnl_AB16b64a, + dnnl_AB8b16a2b, + dnnl_AB8b32a2b, + dnnl_AB8b64a2b, + dnnl_AB4b16a4b, + dnnl_AB4b32a4b, + dnnl_AB4b64a4b, + dnnl_AB16b16a4b, + dnnl_ABcd16b32a, + dnnl_ABcd16b64a, + dnnl_ABcd4b32a4b, + dnnl_ABcd4b64a4b, + dnnl_ABcd8b32a2b, + dnnl_ABcd8b64a2b, + dnnl_ABcde4b32a4b, + dnnl_ABcde4b64a4b, + dnnl_ABcde16b16a4b, + dnnl_ABcde16b16a2b, + dnnl_ABcde16b32a, + dnnl_ABcde16b64a, + 
dnnl_ABcde8b32a2b, + dnnl_ABcde8b64a2b, + dnnl_aBCdef16c16b4c, + dnnl_aBCdef16c16b2c, + dnnl_AB32a32b8a4b, + dnnl_AB8a4b, + dnnl_AB32a32b8a2b, + dnnl_AB8a2b, + dnnl_abDc32d, + dnnl_abDC32d4c, + dnnl_abdEc32e, + dnnl_abdEC32e2c, + dnnl_abdEC32e4c, + dnnl_aBdefC16b4c, + dnnl_AcdeB16a4b, + dnnl_ABcd16a16b2a, + dnnl_ABc16a16b2a, + dnnl_aBCd16b16c2b, + dnnl_aBCde16b16c2b, + dnnl_Acb32a, + dnnl_AcB32a2b, + dnnl_AcB32a4b, + dnnl_Acb48a, + dnnl_AcB48a2b, + dnnl_AcB48a4b, + dnnl_Acb64a, + dnnl_AcB64a2b, + dnnl_AcB64a4b, + dnnl_cBa2b, + dnnl_cBa4b, + dnnl_aBdc32b, + dnnl_aBdC32b2c, + dnnl_aBdC32b4c, + dnnl_aBdc48b, + dnnl_aBdC48b2c, + dnnl_aBdC48b4c, + dnnl_aBdc64b, + dnnl_aBdC64b2c, + dnnl_aBdC64b4c, + dnnl_adCb2c, + dnnl_adCb4c, + dnnl_AcdB32a2b, + dnnl_AcdB32a4b, + dnnl_Acdb48a, + dnnl_AcdB48a2b, + dnnl_AcdB48a4b, + dnnl_Acdb64a, + dnnl_AcdB64a2b, + dnnl_AcdB64a4b, + dnnl_cdBa2b, + dnnl_cdBa4b, + dnnl_aBdeC32b2c, + dnnl_aBdeC32b4c, + dnnl_aBdec48b, + dnnl_aBdeC48b2c, + dnnl_aBdeC48b4c, + dnnl_aBdec64b, + dnnl_aBdeC64b2c, + dnnl_aBdeC64b4c, + dnnl_adeCb2c, + dnnl_adeCb4c, + dnnl_Acdeb32a, + dnnl_AcdeB32a2b, + dnnl_AcdeB32a4b, + dnnl_Acdeb48a, + dnnl_AcdeB48a2b, + dnnl_AcdeB48a4b, + dnnl_Acdeb64a, + dnnl_AcdeB64a2b, + dnnl_AcdeB64a4b, + dnnl_cdeBa2b, + dnnl_cdeBa4b, + dnnl_aBdefc32b, + dnnl_aBdefC32b2c, + dnnl_aBdefC32b4c, + dnnl_aBdefc48b, + dnnl_aBdefC48b2c, + dnnl_aBdefC48b4c, + dnnl_aBdefc64b, + dnnl_aBdefC64b2c, + dnnl_aBdefC64b4c, + dnnl_adefCb2c, + dnnl_adefCb4c, + dnnl_AB16b32a4b, + dnnl_AB16b48a4b, + dnnl_AB16b64a4b, + dnnl_AB16b16a2b, + dnnl_AB16b32a2b, + dnnl_AB16b48a2b, + dnnl_AB16b64a2b, + dnnl_ABc16b32a4b, + dnnl_ABc16b48a4b, + dnnl_ABc16b64a4b, + dnnl_ABc16b32a2b, + dnnl_ABc16b48a2b, + dnnl_ABc16b64a2b, + dnnl_ABcd16b32a4b, + dnnl_ABcd16b48a4b, + dnnl_ABcd16b64a4b, + dnnl_ABcd16b32a2b, + dnnl_ABcd16b48a2b, + dnnl_ABcd16b64a2b, + dnnl_ABcde16b32a4b, + dnnl_ABcde16b48a4b, + dnnl_ABcde16b64a4b, + dnnl_ABcde16b32a2b, + dnnl_ABcde16b48a2b, + dnnl_ABcde16b64a2b, + dnnl_ABc32a16b, + dnnl_ABcd32a16b, + dnnl_ABcde32a16b, + dnnl_AB48a16b, + dnnl_AB48a32b, + dnnl_ABc40a16b, + dnnl_ABc40a32b, + dnnl_aBC48b16c, + dnnl_aBC48b32c, + dnnl_ABcd40a16b, + dnnl_ABcd40a32b, + dnnl_abCd32c, + dnnl_abdCe32c, + dnnl_abdCE32c2e, + dnnl_BA16a16b2a, + dnnl_BA16a32b2a, + dnnl_BA16a48b2a, + dnnl_BA16a64b2a, + dnnl_BA16a16b4a, + dnnl_BA16a32b4a, + dnnl_BA16a48b4a, + dnnl_BA16a64b4a, + dnnl_ABcd8a2b, + dnnl_aBdeC16c16b2c, + dnnl_aBdeC16c16b4c, + dnnl_aBdefC16c16b2c, + dnnl_AcB16b16a2b, + dnnl_AcB16b16a4b, + dnnl_AcdB16b16a2b, + dnnl_AcdB16b16a4b, + dnnl_AcdeB16b16a2b, + dnnl_aBdefC16c16b4c, + dnnl_AcdeB16b16a4b, + dnnl_AcB16b32a2b, + dnnl_AcB16b32a4b, + dnnl_AcB16b48a2b, + dnnl_AcB16b48a4b, + dnnl_AcB16b64a2b, + dnnl_AcB16b64a4b, + dnnl_aBdC16c16b2c, + dnnl_aBdC16c16b4c, + dnnl_aBdC16c32b2c, + dnnl_aBdC16c32b4c, + dnnl_aBdC16c48b2c, + dnnl_aBdC16c48b4c, + dnnl_aBdC16c64b2c, + dnnl_aBdC16c64b4c, + dnnl_AcdB16b32a2b, + dnnl_AcdB16b32a4b, + dnnl_AcdB16b48a2b, + dnnl_AcdB16b48a4b, + dnnl_AcdB16b64a2b, + dnnl_AcdB16b64a4b, + dnnl_aBdeC16c32b2c, + dnnl_aBdeC16c32b4c, + dnnl_aBdeC16c48b2c, + dnnl_aBdeC16c48b4c, + dnnl_aBdeC16c64b2c, + dnnl_aBdeC16c64b4c, + dnnl_AcdeB16b32a2b, + dnnl_AcdeB16b32a4b, + dnnl_AcdeB16b48a2b, + dnnl_AcdeB16b48a4b, + dnnl_AcdeB16b64a2b, + dnnl_AcdeB16b64a4b, + dnnl_aBdefC16c32b2c, + dnnl_aBdefC16c32b4c, + dnnl_aBdefC16c48b2c, + dnnl_aBdefC16c48b4c, + dnnl_aBdefC16c64b2c, + dnnl_aBdefC16c64b4c, + dnnl_decbA16a, + dnnl_ABc4a2b, + dnnl_ABc8a2b, + dnnl_aBCd8b2c, + dnnl_ABcde4a2b, + dnnl_ABcde8a2b, + 
dnnl_ABcde40a16b, + dnnl_ABcde40a32b, + dnnl_aBCde8b2c, + dnnl_ABcde4a8b8a2b, + dnnl_ABcd4a8b8a2b, + dnnl_ABc4a8b8a2b, + dnnl_aBCdef4b8c8b2c, + dnnl_aBCde4b8c8b2c, + dnnl_aBCd4b8c8b2c, + dnnl_BAcde4b8a8b2a, + dnnl_BAcd4b8a8b2a, + dnnl_BAc4b8a8b2a, + dnnl_aCBdef4c8b8c2b, + dnnl_aCBde4c8b8c2b, + dnnl_aCBd4c8b8c2b, + dnnl_aBCdef8b2c, + dnnl_AB32a16b, + dnnl_AB32a32b, + dnnl_BA4b8a8b2a, + dnnl_BA4b8a8b4a, + dnnl_aBC32b16c, + dnnl_aBC32b32c, + dnnl_aCB4c8b8c2b, + dnnl_aCB4c8b8c4b, + dnnl_ABcd4a2b, + dnnl_ABc2b8a16b4a, + dnnl_ABcd2b8a16b4a, + dnnl_ABcde2b8a16b4a, + dnnl_ABc2a8b16a4b, + dnnl_ABc2a8b16a2b, + dnnl_ABc2b32a8b, + dnnl_ABcd2a8b16a4b, + dnnl_ABcd2a8b16a2b, + dnnl_aCBd2c8b16c2b, + dnnl_ABcd2b32a8b, + dnnl_aBCd2c8b16c2b, + dnnl_ABcde2a8b16a4b, + dnnl_ABcde2a8b16a2b, + dnnl_aCBde2c8b16c2b, + dnnl_ABcde2b32a8b, + dnnl_aBC2b8c16b2c, + dnnl_aBCd2b8c16b2c, + dnnl_aBCde2b8c16b2c, + dnnl_aBCdef2b8c16b2c, + dnnl_BAcde2b8a16b4a, + dnnl_BAcd2b8a16b4a, + dnnl_BAc2b8a16b4a, + dnnl_BAcde2b8a16b2a, + dnnl_BAcd2b8a16b2a, + dnnl_BAc2b8a16b2a, + dnnl_aBCde2c8b16c2b, + dnnl_aBCdef2c8b16c2b, + dnnl_aCBdef2c8b16c2b, + dnnl_aBCd2b8c16b4c, + dnnl_aBCde2b8c16b4c, + dnnl_BA4b8a16b2a, + dnnl_BA4b8a16b4a, + dnnl_aCB4c8b16c2b, + dnnl_aCB4c8b16c4b, + dnnl_BA16a16b, + dnnl_BA16a32b, + dnnl_BA16a48b, + dnnl_BA16a64b, + dnnl_aCB16c2b, + dnnl_aCB16c4b, + dnnl_BA16b2a, + dnnl_BA16b4a, + dnnl_aBC16b16c, + dnnl_aBC16b32c, + dnnl_AB16a16b, + dnnl_AB16a32b, + dnnl_ABcde16a16b2a, + dnnl_aBCdef16b16c2b, + dnnl_Acedb16a, + dnnl_aBdfec16b, + dnnl_abdEC64e2c, + dnnl_abdEC64e4c, + dnnl_aCB16b16c, + dnnl_aCB16b32c, + dnnl_aCB16b48c, + dnnl_aCB16b64c, + dnnl_aCB16b16c2b, + dnnl_aCB16b32c2b, + dnnl_aCB16b48c2b, + dnnl_aCB16b64c2b, + dnnl_aCB16b16c4b, + dnnl_aCB16b32c4b, + dnnl_aCB16b48c4b, + dnnl_aCB16b64c4b, + dnnl_abCd4c, + dnnl_abCde4c, + dnnl_abCdef4c, + dnnl_abCde32c, + dnnl_abCdef32c, + dnnl_ABcd16a32b, + dnnl_decbA8a, + dnnl_aCdefB16b32c2b, + dnnl_aCdefB16b32c4b, + dnnl_aCdefB16b48c2b, + dnnl_aCdefB16b48c4b, + dnnl_aCdefB16b64c2b, + dnnl_aCdefB16b64c4b, + dnnl_BcdeA16a32b2a, + dnnl_BcdeA16a32b4a, + dnnl_BcdeA16a48b2a, + dnnl_BcdeA16a48b4a, + dnnl_BcdeA16a64b2a, + dnnl_BcdeA16a64b4a, + dnnl_aCdefb32c, + dnnl_aCdefB32c2b, + dnnl_aCdefB32c4b, + dnnl_aCdefb48c, + dnnl_aCdefB48c2b, + dnnl_aCdefB48c4b, + dnnl_aCdefb64c, + dnnl_aCdefB64c2b, + dnnl_aCdefB64c4b, + dnnl_Bcdea32b, + dnnl_BcdeA32b2a, + dnnl_BcdeA32b4a, + dnnl_Bcdea48b, + dnnl_BcdeA48b2a, + dnnl_BcdeA48b4a, + dnnl_Bcdea64b, + dnnl_BcdeA64b2a, + dnnl_BcdeA64b4a, + dnnl_Bca32b, + dnnl_BcA32b2a, + dnnl_BcA32b4a, + dnnl_Bca48b, + dnnl_BcA48b2a, + dnnl_BcA48b4a, + dnnl_Bca64b, + dnnl_BcA64b2a, + dnnl_BcA64b4a, + dnnl_aCdb32c, + dnnl_aCdB32c2b, + dnnl_aCdB32c4b, + dnnl_aCdb48c, + dnnl_aCdB48c2b, + dnnl_aCdB48c4b, + dnnl_aCdb64c, + dnnl_aCdB64c2b, + dnnl_aCdB64c4b, + dnnl_BcA16a16b2a, + dnnl_BcA16a16b4a, + dnnl_BcdA16a16b2a, + dnnl_BcdA16a16b4a, + dnnl_BcdeA16a16b2a, + dnnl_BcdeA16a16b4a, + dnnl_aCdB16b16c2b, + dnnl_aCdB16b16c4b, + dnnl_aCdeB16b16c2b, + dnnl_aCdeB16b16c4b, + dnnl_aCdefB16b16c2b, + dnnl_aCdefB16b16c4b, + dnnl_BcA16a32b2a, + dnnl_BcA16a32b4a, + dnnl_BcA16a48b2a, + dnnl_BcA16a48b4a, + dnnl_BcA16a64b2a, + dnnl_BcA16a64b4a, + dnnl_aCdB16b32c2b, + dnnl_aCdB16b32c4b, + dnnl_aCdB16b48c2b, + dnnl_aCdB16b48c4b, + dnnl_aCdB16b64c2b, + dnnl_aCdB16b64c4b, + dnnl_BcdA16a32b2a, + dnnl_BcdA16a32b4a, + dnnl_BcdA16a48b2a, + dnnl_BcdA16a48b4a, + dnnl_BcdA16a64b2a, + dnnl_BcdA16a64b4a, + dnnl_aCdeB16b32c2b, + dnnl_aCdeB16b32c4b, + dnnl_aCdeB16b48c2b, + dnnl_aCdeB16b48c4b, + 
dnnl_aCdeB16b64c2b, + dnnl_aCdeB16b64c4b, + dnnl_Bca16b, + dnnl_BcA16b2a, + dnnl_BcA16b4a, + dnnl_Bcda16b, + dnnl_BcdA16b2a, + dnnl_BcdA16b4a, + dnnl_Bcdea16b, + dnnl_BcdeA16b2a, + dnnl_BcdeA16b4a, + dnnl_aCdb16c, + dnnl_aCdB16c2b, + dnnl_aCdB16c4b, + dnnl_aCdeb16c, + dnnl_aCdeB16c2b, + dnnl_aCdeB16c4b, + dnnl_aCdefb16c, + dnnl_aCdefB16c2b, + dnnl_aCdefB16c4b, + dnnl_Bcda32b, + dnnl_BcdA32b2a, + dnnl_BcdA32b4a, + dnnl_Bcda48b, + dnnl_BcdA48b2a, + dnnl_BcdA48b4a, + dnnl_Bcda64b, + dnnl_BcdA64b2a, + dnnl_BcdA64b4a, + dnnl_aCdeb32c, + dnnl_aCdeB32c2b, + dnnl_aCdeB32c4b, + dnnl_aCdeb48c, + dnnl_aCdeB48c2b, + dnnl_aCdeB48c4b, + dnnl_aCdeb64c, + dnnl_aCdeB64c2b, + dnnl_aCdeB64c4b, + dnnl_Acb24a, + dnnl_Acdb24a, + dnnl_Acdeb24a, + dnnl_aBdc24b, + dnnl_aBdec24b, + dnnl_aBdefc24b, + dnnl_abDc16d, + dnnl_abdEc16e, + dnnl_abdCe16c, + dnnl_AcB24a2b, + dnnl_AcdB24a2b, + dnnl_AcdeB24a2b, + dnnl_aBdC24b2c, + dnnl_aBdeC24b2c, + dnnl_aBdefC24b2c, + dnnl_AcB8a2b, + dnnl_AcdB8a2b, + dnnl_AcdeB8a2b, + dnnl_aBdC8b2c, + dnnl_aBdeC8b2c, + dnnl_aBdefC8b2c, + dnnl_AB8b32a, + dnnl_ABc8b32a, + dnnl_ABcd8b32a, + dnnl_ABcde8b32a, + dnnl_AB8b24a, + dnnl_ABc8b24a, + dnnl_ABcd8b24a, + dnnl_ABcde8b24a, + dnnl_AB8b16a, + dnnl_ABc8b16a, + dnnl_ABcd8b16a, + dnnl_ABcde8b16a, + dnnl_AB8b8a, + dnnl_AB4b8a4b, + dnnl_AB4b24a4b, + dnnl_ABc4b8a4b, + dnnl_ABc4b24a4b, + dnnl_ABcd4b8a4b, + dnnl_ABcd4b24a4b, + dnnl_ABcde4b8a4b, + dnnl_ABcde4b24a4b, + dnnl_AB8b24a2b, + dnnl_ABc8b24a2b, + dnnl_ABcd8b24a2b, + dnnl_ABcde8b24a2b, + dnnl_AB8b8a2b, + dnnl_ABc8b8a2b, + dnnl_ABcd8b8a2b, + dnnl_ABcde8b8a2b, + dnnl_AcB24a4b, + dnnl_AcdB24a4b, + dnnl_AcdeB24a4b, + dnnl_aBdC24b4c, + dnnl_aBdeC24b4c, + dnnl_aBdefC24b4c, + dnnl_AcB8a4b, + dnnl_AcdB8a4b, + dnnl_AcdeB8a4b, + dnnl_aBdC8b4c, + dnnl_aBdeC8b4c, + dnnl_aBdefC8b4c, + dnnl_Bca8b, + dnnl_BcA8b2a, + dnnl_Bcda8b, + dnnl_BcdA8b2a, + dnnl_Bcdea8b, + dnnl_BcdeA8b2a, + dnnl_aCdb8c, + dnnl_aCdB8c2b, + dnnl_aCdeb8c, + dnnl_aCdeB8c2b, + dnnl_aCdefb8c, + dnnl_aCdefB8c2b, + dnnl_Bca24b, + dnnl_BcA24b2a, + dnnl_Bcda24b, + dnnl_BcdA24b2a, + dnnl_Bcdea24b, + dnnl_BcdeA24b2a, + dnnl_aCdb24c, + dnnl_aCdB24c2b, + dnnl_aCdeb24c, + dnnl_aCdeB24c2b, + dnnl_aCdefb24c, + dnnl_aCdefB24c2b, + dnnl_BcA8b4a, + dnnl_BcdA8b4a, + dnnl_BcdeA8b4a, + dnnl_aCdB8c4b, + dnnl_aCdeB8c4b, + dnnl_aCdefB8c4b, + dnnl_BcA24b4a, + dnnl_BcdA24b4a, + dnnl_BcdeA24b4a, + dnnl_aCdB24c4b, + dnnl_aCdeB24c4b, + dnnl_aCdefB24c4b, + dnnl_AB16b48a, + dnnl_ABc16b48a, + dnnl_ABcd16b48a, + dnnl_ABcde16b48a, + dnnl_ABc16a4b, + dnnl_ABcd16a4b, + dnnl_ABcde16a4b, + dnnl_defcbA16a, + dnnl_defcbA8a, + dnnl_AcB16b64a, + dnnl_AcdB16b64a, + dnnl_AcdeB16b64a, + dnnl_AcB16b48a, + dnnl_AcdB16b48a, + dnnl_AcdeB16b48a, + dnnl_AcB16b32a, + dnnl_AcdB16b32a, + dnnl_AcdeB16b32a, + dnnl_AcB16b16a, + dnnl_AcdB16b16a, + dnnl_AcdeB16b16a, + dnnl_AcB8b32a, + dnnl_AcdB8b32a, + dnnl_AcdeB8b32a, + dnnl_AcB8b24a, + dnnl_AcdB8b24a, + dnnl_AcdeB8b24a, + dnnl_AcB8b16a, + dnnl_AcdB8b16a, + dnnl_AcdeB8b16a, + dnnl_AcB8b8a, + dnnl_AcdB8b8a, + dnnl_AcdeB8b8a, + dnnl_AcB8b64a2b, + dnnl_AcdB8b64a2b, + dnnl_AcdeB8b64a2b, + dnnl_AcB8b32a2b, + dnnl_AcdB8b32a2b, + dnnl_AcdeB8b32a2b, + dnnl_AcB8b24a2b, + dnnl_AcdB8b24a2b, + dnnl_AcdeB8b24a2b, + dnnl_AcB8b16a2b, + dnnl_AcdB8b16a2b, + dnnl_AcdeB8b16a2b, + dnnl_AcB8b8a2b, + dnnl_AcdB8b8a2b, + dnnl_AcdeB8b8a2b, + dnnl_AcB4b64a4b, + dnnl_AcdB4b64a4b, + dnnl_AcdeB4b64a4b, + dnnl_AcB4b32a4b, + dnnl_AcdB4b32a4b, + dnnl_AcdeB4b32a4b, + dnnl_AcB4b24a4b, + dnnl_AcdB4b24a4b, + dnnl_AcdeB4b24a4b, + dnnl_AcB4b16a4b, + dnnl_AcdB4b16a4b, + 
dnnl_AcdeB4b16a4b, + dnnl_AcB4b8a4b, + dnnl_AcdB4b8a4b, + dnnl_AcdeB4b8a4b, + dnnl_Ab4a, + dnnl_Ab8a, + dnnl_BA4b4a, + dnnl_BA8b4a, + dnnl_BA2a24b, + dnnl_aCB2b24c, + dnnl_BA2a8b, + dnnl_aCB2b8c, + dnnl_BA8a24b, + dnnl_aCB8b24c, + dnnl_BA8a16b, + dnnl_aCB8b16c, + dnnl_BA8a8b, + dnnl_aCB8b8c, + dnnl_bcad, + dnnl_cabd, + dnnl_dabc, + dnnl_Ab32a, + dnnl_aCBd8b8c, + dnnl_aCBde8b8c, + dnnl_BAc8a8b, + dnnl_BAcd8a8b, + dnnl_BAcde8a8b, + dnnl_aCBdef8b8c, + dnnl_abdEC16e4c, + dnnl_abDC16d4c, + + /// Just a sentinel, not a real memory format tag. Must be changed after a new + /// format tag is added. + dnnl_format_tag_last, + + // Aliases + + /// 1D tensor, an alias to #dnnl_a + dnnl_x = dnnl_a, + /// 2D CNN activations tensor, an alias to #dnnl_ab + dnnl_nc = dnnl_ab, + /// 2D CNN activations tensor, an alias to #dnnl_ba + dnnl_cn = dnnl_ba, + /// 2D RNN statistics tensor, an alias to #dnnl_ab + dnnl_tn = dnnl_ab, + /// 2D RNN statistics tensor, an alias to #dnnl_ba + dnnl_nt = dnnl_ba, + /// 3D CNN activations tensor, an alias to #dnnl_abc + dnnl_ncw = dnnl_abc, + /// 3D CNN activations tensor, an alias to #dnnl_acb + dnnl_nwc = dnnl_acb, + /// 4D CNN activations tensor, an alias to #dnnl_abcd + dnnl_nchw = dnnl_abcd, + /// 4D CNN activations tensor, an alias to #dnnl_acdb + dnnl_nhwc = dnnl_acdb, + /// 4D CNN activations tensor, an alias to #dnnl_bcda + dnnl_chwn = dnnl_bcda, + /// 5D CNN activations tensor, an alias to #dnnl_abcde + dnnl_ncdhw = dnnl_abcde, + /// 5D CNN activations tensor, an alias to #dnnl_acdeb + dnnl_ndhwc = dnnl_acdeb, + + /// 2D CNN weights tensor, an alias to #dnnl_ab + dnnl_oi = dnnl_ab, + /// 2D CNN weights tensor, an alias to #dnnl_ba + dnnl_io = dnnl_ba, + /// 3D CNN weights tensor, an alias to #dnnl_abc + dnnl_oiw = dnnl_abc, + /// 3D CNN weights tensor, an alias to #dnnl_acb + dnnl_owi = dnnl_acb, + /// 3D CNN weights tensor, an alias to #dnnl_cba + dnnl_wio = dnnl_cba, + /// 3D CNN weights tensor, an alias to #dnnl_cab + dnnl_woi = dnnl_cab, + /// 3D CNN weights tensor, an alias to #dnnl_bca + dnnl_iwo = dnnl_bca, + /// 4D CNN weights tensor, an alias to #dnnl_abcd + dnnl_oihw = dnnl_abcd, + /// 4D CNN weights tensor, an alias to #dnnl_cdba + dnnl_hwio = dnnl_cdba, + /// 4D CNN weights tensor, an alias to #dnnl_cdab + dnnl_hwoi = dnnl_cdab, + /// 4D CNN weights tensor, an alias to #dnnl_acdb + dnnl_ohwi = dnnl_acdb, + /// 4D CNN weights tensor, an alias to #dnnl_bcda + dnnl_ihwo = dnnl_bcda, + /// 4D CNN weights tensor, an alias to #dnnl_bacd + dnnl_iohw = dnnl_bacd, + /// 5D CNN weights tensor, an alias to #dnnl_abcde + dnnl_oidhw = dnnl_abcde, + /// 5D CNN weights tensor, an alias to #dnnl_bacde + dnnl_iodhw = dnnl_bacde, + /// 5D CNN weights tensor, an alias to #dnnl_cdeba + dnnl_dhwio = dnnl_cdeba, + /// 5D CNN weights tensor, an alias to #dnnl_cdeab + dnnl_dhwoi = dnnl_cdeab, + /// 5D CNN weights tensor, an alias to #dnnl_acdeb + dnnl_odhwi = dnnl_acdeb, + /// 5D CNN weights tensor, an alias to #dnnl_bcdea + dnnl_idhwo = dnnl_bcdea, + + /// 4D CNN weights tensor (incl. groups), an alias to #dnnl_abcd + dnnl_goiw = dnnl_abcd, + /// 4D CNN weights tensor (incl. groups), an alias to #dnnl_abdc + dnnl_gowi = dnnl_abdc, + /// 4D CNN weights tensor (incl. groups), an alias to #dnnl_dcab + dnnl_wigo = dnnl_dcab, + /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_abcde + dnnl_goihw = dnnl_abcde, + /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_abdec + dnnl_gohwi = dnnl_abdec, + /// 5D CNN weights tensor (incl.
groups), an alias to #dnnl_decab + dnnl_hwigo = dnnl_decab, + /// 5D CNN weights tensor (incl. groups), an alias to #dnnl_acbde + dnnl_giohw = dnnl_acbde, + /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_abcdef + dnnl_goidhw = dnnl_abcdef, + /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_abdefc + dnnl_godhwi = dnnl_abdefc, + /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_acbdef + dnnl_giodhw = dnnl_acbdef, + /// 6D CNN weights tensor (incl. groups), an alias to #dnnl_defcab + dnnl_dhwigo = dnnl_defcab, + + /// 3D RNN data tensor in the format (seq_length, batch, input channels), + /// an alias to #dnnl_abc. + dnnl_tnc = dnnl_abc, + /// 3D RNN data tensor in the format (batch, seq_length, input channels), + /// an alias to #dnnl_bac. + dnnl_ntc = dnnl_bac, + /// 4D RNN states tensor in the format (num_layers, num_directions, + /// batch, state channels), an alias to #dnnl_abcd. + dnnl_ldnc = dnnl_abcd, + /// 5D RNN weights tensor in the format (num_layers, num_directions, + /// input_channels, num_gates, output_channels), an alias to #dnnl_abcde. + /// + /// - For LSTM cells, the gates order is input, forget, candidate + /// and output gate. + /// - For GRU cells, the gates order is update, reset and output gate. + dnnl_ldigo = dnnl_abcde, + /// 5D RNN weights tensor in the format (num_layers, num_directions, + /// num_gates, output_channels, input_channels), an alias to #dnnl_abdec. + /// + /// - For LSTM cells, the gates order is input, forget, candidate + /// and output gate. + /// - For GRU cells, the gates order is update, reset and output gate. + dnnl_ldgoi = dnnl_abdec, + /// 4D LSTM projection tensor in the format (num_layers, num_directions, + /// num_channels_in_hidden_state, num_channels_in_recurrent_projection), + /// an alias to #dnnl_abcd. + dnnl_ldio = dnnl_abcd, + /// 4D LSTM projection tensor in the format (num_layers, num_directions, + /// num_channels_in_recurrent_projection, num_channels_in_hidden_state), + /// an alias to #dnnl_abdc. + dnnl_ldoi = dnnl_abdc, + /// 4D RNN bias tensor in the format (num_layers, num_directions, + /// num_gates, output_channels), an alias to #dnnl_abcd. + /// + /// - For LSTM cells, the gates order is input, forget, candidate + /// and output gate. + /// - For GRU cells, the gates order is update, reset and output gate. 
+ dnnl_ldgo = dnnl_abcd, + /// 5D LSTM projection tensor + dnnl_ldOi16o = dnnl_abDc16d, + dnnl_ldOi32o = dnnl_abDc32d, + dnnl_ldOI16o4i = dnnl_abDC16d4c, + dnnl_ldOI32o4i = dnnl_abDC32d4c, + dnnl_ldIo32i = dnnl_abCd32c, + /// 6D RNN weights tensor + dnnl_ldgOi16o = dnnl_abdEc16e, + dnnl_ldgOI16o4i = dnnl_abdEC16e4c, + dnnl_ldgOi32o = dnnl_abdEc32e, + dnnl_ldgOI32o2i = dnnl_abdEC32e2c, + dnnl_ldgOI32o4i = dnnl_abdEC32e4c, + dnnl_ldgOI64o2i = dnnl_abdEC64e2c, + dnnl_ldgOI64o4i = dnnl_abdEC64e4c, + dnnl_ldgIo16i = dnnl_abdCe16c, + dnnl_ldgIo32i = dnnl_abdCe32c, + dnnl_ldgIO32i2o = dnnl_abdCE32c2e, + + // Opaque data types; not to be used explicitly + + // data + /// 5D CNN activations tensor blocked by channels with block size 32, + /// an alias to #dnnl_aBcde32b + dnnl_nCdhw32c = dnnl_aBcde32b, + /// 5D CNN activations tensor blocked by channels with block size 16, + /// an alias to #dnnl_aBcde16b + dnnl_nCdhw16c = dnnl_aBcde16b, + /// 5D CNN activations tensor blocked by channels with block size 4, + /// an alias to #dnnl_aBcde4b + dnnl_nCdhw4c = dnnl_aBcde4b, + /// 5D CNN activations tensor blocked by channels with block size 8, + /// an alias to #dnnl_aBcde8b + dnnl_nCdhw8c = dnnl_aBcde8b, + /// 4D CNN activations tensor blocked by channels with block size 32, + /// an alias to #dnnl_aBcd32b + dnnl_nChw32c = dnnl_aBcd32b, + /// 4D CNN activations tensor blocked by channels with block size 16, + /// an alias to #dnnl_aBcd16b + dnnl_nChw16c = dnnl_aBcd16b, + /// 4D CNN activations tensor blocked by channels with block size 4, + /// an alias to #dnnl_aBcd4b + dnnl_nChw4c = dnnl_aBcd4b, + /// 4D CNN activations tensor blocked by channels with block size 8, + /// an alias to #dnnl_aBcd8b + dnnl_nChw8c = dnnl_aBcd8b, + /// 3D CNN activations tensor blocked by channels with block size 32, + /// an alias to #dnnl_aBc32b + dnnl_nCw32c = dnnl_aBc32b, + /// 3D CNN activations tensor blocked by channels with block size 16, + /// an alias to #dnnl_aBc16b + dnnl_nCw16c = dnnl_aBc16b, + /// 3D CNN activations tensor blocked by channels with block size 4, + /// an alias to #dnnl_aBc4b + dnnl_nCw4c = dnnl_aBc4b, + /// 3D CNN activations tensor blocked by channels with block size 8, + /// an alias to #dnnl_aBc8b + dnnl_nCw8c = dnnl_aBc8b, + dnnl_NCw16n16c = dnnl_ABc16a16b, + dnnl_NCdhw16n16c = dnnl_ABcde16a16b, + dnnl_NChw16n16c = dnnl_ABcd16a16b, + dnnl_NCw32n16c = dnnl_ABc32a16b, + dnnl_NChw32n16c = dnnl_ABcd32a16b, + dnnl_NChw16n32c = dnnl_ABcd16a32b, + dnnl_NCdhw32n16c = dnnl_ABcde32a16b, + dnnl_NCw32n32c = dnnl_ABc32a32b, + dnnl_NChw32n32c = dnnl_ABcd32a32b, + dnnl_NCdhw32n32c = dnnl_ABcde32a32b, + + // weights, 2D + dnnl_OI16i16o = dnnl_AB16b16a, + dnnl_OI16i32o = dnnl_AB16b32a, + dnnl_OI16i48o = dnnl_AB16b48a, + dnnl_OI16i64o = dnnl_AB16b64a, + dnnl_OI8i8o2i = dnnl_AB8b8a2b, + dnnl_OI8i16o2i = dnnl_AB8b16a2b, + dnnl_OI8i24o2i = dnnl_AB8b24a2b, + dnnl_OI8i32o2i = dnnl_AB8b32a2b, + dnnl_OI8i64o2i = dnnl_AB8b64a2b, + dnnl_OI4i8o4i = dnnl_AB4b8a4b, + dnnl_OI4i16o4i = dnnl_AB4b16a4b, + dnnl_OI4i24o4i = dnnl_AB4b24a4b, + dnnl_OI4i32o4i = dnnl_AB4b32a4b, + dnnl_OI4i64o4i = dnnl_AB4b64a4b, + dnnl_OI16i16o4i = dnnl_AB16b16a4b, + dnnl_OI8i32o = dnnl_AB8b32a, + dnnl_OI8i24o = dnnl_AB8b24a, + dnnl_OI8i16o = dnnl_AB8b16a, + dnnl_OI8i8o = dnnl_AB8b8a, + + // weights, 3D + dnnl_IOw8o8i = dnnl_BAc8a8b, + dnnl_IOw16o16i = dnnl_BAc16a16b, + dnnl_IOw16i16o = dnnl_BAc16b16a, + dnnl_OIw16i16o = dnnl_ABc16b16a, + dnnl_OwI16i16o = dnnl_AcB16b16a, + dnnl_OIw16i32o = dnnl_ABc16b32a, + dnnl_OwI16i32o = dnnl_AcB16b32a, +
dnnl_OIw16i48o = dnnl_ABc16b48a, + dnnl_OwI16i48o = dnnl_AcB16b48a, + dnnl_OIw16i64o = dnnl_ABc16b64a, + dnnl_OwI16i64o = dnnl_AcB16b64a, + dnnl_OIw16o16i = dnnl_ABc16a16b, + dnnl_Oiw16o = dnnl_Abc16a, + dnnl_OIw4i8o4i = dnnl_ABc4b8a4b, + dnnl_OwI4i8o4i = dnnl_AcB4b8a4b, + dnnl_OIw4i16o4i = dnnl_ABc4b16a4b, + dnnl_OwI4i16o4i = dnnl_AcB4b16a4b, + dnnl_OIw4i24o4i = dnnl_ABc4b24a4b, + dnnl_OwI4i24o4i = dnnl_AcB4b24a4b, + dnnl_OIw4i32o4i = dnnl_ABc4b32a4b, + dnnl_OwI4i32o4i = dnnl_AcB4b32a4b, + dnnl_OIw4i64o4i = dnnl_ABc4b64a4b, + dnnl_OwI4i64o4i = dnnl_AcB4b64a4b, + dnnl_OIw2i8o4i = dnnl_ABc2b8a4b, + dnnl_OIw16i16o4i = dnnl_ABc16b16a4b, + dnnl_OIw16i16o2i = dnnl_ABc16b16a2b, + dnnl_OIw16o16i2o = dnnl_ABc16a16b2a, + dnnl_OIw4i4o = dnnl_ABc4b4a, + dnnl_OIw4o4i = dnnl_ABc4a4b, + dnnl_Oiw4o = dnnl_Abc4a, + dnnl_OIw8i8o2i = dnnl_ABc8b8a2b, + dnnl_OwI8i8o2i = dnnl_AcB8b8a2b, + dnnl_OIw8i16o2i = dnnl_ABc8b16a2b, + dnnl_OwI8i16o2i = dnnl_AcB8b16a2b, + dnnl_OIw8i24o2i = dnnl_ABc8b24a2b, + dnnl_OwI8i24o2i = dnnl_AcB8b24a2b, + dnnl_OIw8i32o2i = dnnl_ABc8b32a2b, + dnnl_OwI8i32o2i = dnnl_AcB8b32a2b, + dnnl_OIw8i64o2i = dnnl_ABc8b64a2b, + dnnl_OwI8i64o2i = dnnl_AcB8b64a2b, + dnnl_OIw8i8o = dnnl_ABc8b8a, + dnnl_OwI8i8o = dnnl_AcB8b8a, + dnnl_OIw8o16i2o = dnnl_ABc8a16b2a, + dnnl_IOw8o16i2o = dnnl_BAc8a16b2a, + dnnl_OIw8o8i = dnnl_ABc8a8b, + dnnl_OIw8o4i = dnnl_ABc8a4b, + dnnl_Owi16o = dnnl_Acb16a, + dnnl_OwI16o2i = dnnl_AcB16a2b, + dnnl_OwI16o4i = dnnl_AcB16a4b, + dnnl_Iwo8i = dnnl_Bca8b, + dnnl_IwO8i2o = dnnl_BcA8b2a, + dnnl_IwO8i4o = dnnl_BcA8b4a, + dnnl_Iwo16i = dnnl_Bca16b, + dnnl_IwO16i2o = dnnl_BcA16b2a, + dnnl_IwO16i4o = dnnl_BcA16b4a, + dnnl_Iwo24i = dnnl_Bca24b, + dnnl_IwO24i2o = dnnl_BcA24b2a, + dnnl_IwO24i4o = dnnl_BcA24b4a, + dnnl_Owi4o = dnnl_Acb4a, + dnnl_Owi8o = dnnl_Acb8a, + dnnl_OwI8o2i = dnnl_AcB8a2b, + dnnl_OIw8i32o = dnnl_ABc8b32a, + dnnl_OwI8i32o = dnnl_AcB8b32a, + dnnl_OIw8i24o = dnnl_ABc8b24a, + dnnl_OwI8i24o = dnnl_AcB8b24a, + dnnl_OIw8i16o = dnnl_ABc8b16a, + dnnl_OwI8i16o = dnnl_AcB8b16a, + dnnl_OwI8o4i = dnnl_AcB8a4b, + + // weights, 4D + dnnl_IOhw16i16o = dnnl_BAcd16b16a, + dnnl_IOhw8o8i = dnnl_BAcd8a8b, + dnnl_IOhw16o16i = dnnl_BAcd16a16b, + dnnl_Ohwi16o = dnnl_Acdb16a, + dnnl_OhwI16o2i = dnnl_AcdB16a2b, + dnnl_OhwI16o4i = dnnl_AcdB16a4b, + dnnl_Ihwo8i = dnnl_Bcda8b, + dnnl_IhwO8i2o = dnnl_BcdA8b2a, + dnnl_IhwO8i4o = dnnl_BcdA8b4a, + dnnl_Ihwo16i = dnnl_Bcda16b, + dnnl_IhwO16i2o = dnnl_BcdA16b2a, + dnnl_IhwO16i4o = dnnl_BcdA16b4a, + dnnl_Ihwo24i = dnnl_Bcda24b, + dnnl_IhwO24i2o = dnnl_BcdA24b2a, + dnnl_IhwO24i4o = dnnl_BcdA24b4a, + dnnl_Ohwi24o = dnnl_Acdb24a, + dnnl_Ohwi32o = dnnl_Acdb32a, + dnnl_Ohwi4o = dnnl_Acdb4a, + dnnl_Ohwi8o = dnnl_Acdb8a, + dnnl_OhwI8o2i = dnnl_AcdB8a2b, + dnnl_OhwI8o4i = dnnl_AcdB8a4b, + dnnl_OIhw16i16o = dnnl_ABcd16b16a, + dnnl_OhwI16i16o = dnnl_AcdB16b16a, + dnnl_OIhw16i32o = dnnl_ABcd16b32a, + dnnl_OhwI16i32o = dnnl_AcdB16b32a, + dnnl_OIhw16i48o = dnnl_ABcd16b48a, + dnnl_OhwI16i48o = dnnl_AcdB16b48a, + dnnl_OIhw16i64o = dnnl_ABcd16b64a, + dnnl_OhwI16i64o = dnnl_AcdB16b64a, + dnnl_OIhw16o16i = dnnl_ABcd16a16b, + dnnl_Oihw16o = dnnl_Abcd16a, + dnnl_OIhw4i8o4i = dnnl_ABcd4b8a4b, + dnnl_OhwI4i8o4i = dnnl_AcdB4b8a4b, + dnnl_OIhw4i16o4i = dnnl_ABcd4b16a4b, + dnnl_OhwI4i16o4i = dnnl_AcdB4b16a4b, + dnnl_OIhw4i24o4i = dnnl_ABcd4b24a4b, + dnnl_OhwI4i24o4i = dnnl_AcdB4b24a4b, + dnnl_OIhw4i32o4i = dnnl_ABcd4b32a4b, + dnnl_OhwI4i32o4i = dnnl_AcdB4b32a4b, + dnnl_OIhw4i64o4i = dnnl_ABcd4b64a4b, + dnnl_OhwI4i64o4i = dnnl_AcdB4b64a4b, + dnnl_OIhw16i16o4i = 
dnnl_ABcd16b16a4b, + dnnl_OIhw16i16o2i = dnnl_ABcd16b16a2b, + dnnl_OIhw16o16i2o = dnnl_ABcd16a16b2a, + dnnl_OIhw4i4o = dnnl_ABcd4b4a, + dnnl_OIhw4o4i = dnnl_ABcd4a4b, + dnnl_Oihw4o = dnnl_Abcd4a, + dnnl_OIhw8i8o2i = dnnl_ABcd8b8a2b, + dnnl_OhwI8i8o2i = dnnl_AcdB8b8a2b, + dnnl_OIhw8i16o2i = dnnl_ABcd8b16a2b, + dnnl_OhwI8i16o2i = dnnl_AcdB8b16a2b, + dnnl_OIhw8i32o2i = dnnl_ABcd8b32a2b, + dnnl_OhwI8i32o2i = dnnl_AcdB8b32a2b, + dnnl_OIhw8i24o2i = dnnl_ABcd8b24a2b, + dnnl_OhwI8i24o2i = dnnl_AcdB8b24a2b, + dnnl_OIhw8i64o2i = dnnl_ABcd8b64a2b, + dnnl_OhwI8i64o2i = dnnl_AcdB8b64a2b, + dnnl_OIhw8i8o = dnnl_ABcd8b8a, + dnnl_OhwI8i8o = dnnl_AcdB8b8a, + dnnl_OIhw8o16i2o = dnnl_ABcd8a16b2a, + dnnl_OIhw2i8o4i = dnnl_ABcd2b8a4b, + dnnl_IOhw8o16i2o = dnnl_BAcd8a16b2a, + dnnl_OIhw8o8i = dnnl_ABcd8a8b, + dnnl_OIhw8o4i = dnnl_ABcd8a4b, + dnnl_Owhi16o = dnnl_Adcb16a, + dnnl_OIhw8i32o = dnnl_ABcd8b32a, + dnnl_OhwI8i32o = dnnl_AcdB8b32a, + dnnl_OIhw8i24o = dnnl_ABcd8b24a, + dnnl_OhwI8i24o = dnnl_AcdB8b24a, + dnnl_OIhw8i16o = dnnl_ABcd8b16a, + dnnl_OhwI8i16o = dnnl_AcdB8b16a, + + // weights, 5D + dnnl_Odhwi16o = dnnl_Acdeb16a, + dnnl_OdhwI16o2i = dnnl_AcdeB16a2b, + dnnl_OdhwI16o4i = dnnl_AcdeB16a4b, + dnnl_Idhwo8i = dnnl_Bcdea8b, + dnnl_IdhwO8i2o = dnnl_BcdeA8b2a, + dnnl_IdhwO8i4o = dnnl_BcdeA8b4a, + dnnl_Idhwo16i = dnnl_Bcdea16b, + dnnl_IdhwO16i2o = dnnl_BcdeA16b2a, + dnnl_IdhwO16i4o = dnnl_BcdeA16b4a, + dnnl_Idhwo24i = dnnl_Bcdea24b, + dnnl_IdhwO24i2o = dnnl_BcdeA24b2a, + dnnl_IdhwO24i4o = dnnl_BcdeA24b4a, + dnnl_Odhwi4o = dnnl_Acdeb4a, + dnnl_Odhwi8o = dnnl_Acdeb8a, + dnnl_OdhwI8o2i = dnnl_AcdeB8a2b, + dnnl_OdhwI8o4i = dnnl_AcdeB8a4b, + dnnl_Odwhi16o = dnnl_Acedb16a, + dnnl_OIdhw16i16o = dnnl_ABcde16b16a, + dnnl_OdhwI16i16o = dnnl_AcdeB16b16a, + dnnl_OIdhw16i32o = dnnl_ABcde16b32a, + dnnl_OdhwI16i32o = dnnl_AcdeB16b32a, + dnnl_OIdhw16i48o = dnnl_ABcde16b48a, + dnnl_OdhwI16i48o = dnnl_AcdeB16b48a, + dnnl_OIdhw16i64o = dnnl_ABcde16b64a, + dnnl_OdhwI16i64o = dnnl_AcdeB16b64a, + dnnl_OIdhw16o16i = dnnl_ABcde16a16b, + dnnl_Oidhw16o = dnnl_Abcde16a, + dnnl_OIdhw4i4o = dnnl_ABcde4b4a, + dnnl_OIdhw4o4i = dnnl_ABcde4a4b, + dnnl_Oidhw4o = dnnl_Abcde4a, + dnnl_OIdhw8i8o2i = dnnl_ABcde8b8a2b, + dnnl_OdhwI8i8o2i = dnnl_AcdeB8b8a2b, + dnnl_OIdhw8i16o2i = dnnl_ABcde8b16a2b, + dnnl_OdhwI8i16o2i = dnnl_AcdeB8b16a2b, + dnnl_OIdhw8i32o2i = dnnl_ABcde8b32a2b, + dnnl_OdhwI8i32o2i = dnnl_AcdeB8b32a2b, + dnnl_OIdhw8i24o2i = dnnl_ABcde8b24a2b, + dnnl_OdhwI8i24o2i = dnnl_AcdeB8b24a2b, + dnnl_OIdhw8i64o2i = dnnl_ABcde8b64a2b, + dnnl_OdhwI8i64o2i = dnnl_AcdeB8b64a2b, + dnnl_OIdhw8i8o = dnnl_ABcde8b8a, + dnnl_OdhwI8i8o = dnnl_AcdeB8b8a, + dnnl_OIdhw8o16i2o = dnnl_ABcde8a16b2a, + dnnl_IOdhw8o16i2o = dnnl_BAcde8a16b2a, + dnnl_OIdhw4i8o4i = dnnl_ABcde4b8a4b, + dnnl_OdhwI4i8o4i = dnnl_AcdeB4b8a4b, + dnnl_OIdhw4i16o4i = dnnl_ABcde4b16a4b, + dnnl_OdhwI4i16o4i = dnnl_AcdeB4b16a4b, + dnnl_OIdhw4i24o4i = dnnl_ABcde4b24a4b, + dnnl_OdhwI4i24o4i = dnnl_AcdeB4b24a4b, + dnnl_OIdhw4i32o4i = dnnl_ABcde4b32a4b, + dnnl_OdhwI4i32o4i = dnnl_AcdeB4b32a4b, + dnnl_OIdhw4i64o4i = dnnl_ABcde4b64a4b, + dnnl_OdhwI4i64o4i = dnnl_AcdeB4b64a4b, + dnnl_OIdhw16i16o4i = dnnl_ABcde16b16a4b, + dnnl_OIdhw16i16o2i = dnnl_ABcde16b16a2b, + dnnl_OIdhw2i8o4i = dnnl_ABcde2b8a4b, + dnnl_OIdhw8o8i = dnnl_ABcde8a8b, + dnnl_OIdhw8o4i = dnnl_ABcde8a4b, + dnnl_IOdhw16i16o = dnnl_BAcde16b16a, + dnnl_OIdhw4o8i8o4i = dnnl_ABcde4a8b8a4b, + dnnl_IOdhw8o8i = dnnl_BAcde8a8b, + dnnl_IOdhw16o16i = dnnl_BAcde16a16b, + dnnl_OIdhw16o16i2o = dnnl_ABcde16a16b2a, + dnnl_OIdhw8i32o = 
dnnl_ABcde8b32a, + dnnl_OdhwI8i32o = dnnl_AcdeB8b32a, + dnnl_OIdhw8i24o = dnnl_ABcde8b24a, + dnnl_OdhwI8i24o = dnnl_AcdeB8b24a, + dnnl_OIdhw8i16o = dnnl_ABcde8b16a, + dnnl_OdhwI8i16o = dnnl_AcdeB8b16a, + + // weights w/ groups, 3D + dnnl_Goiw16g = dnnl_Abcd16a, + dnnl_Goiw8g = dnnl_Abcd8a, + dnnl_Goiw4g = dnnl_Abcd4a, + dnnl_gIOw8o8i = dnnl_aCBd8b8c, + dnnl_gIOw16o16i = dnnl_aCBd16b16c, + dnnl_gIOw16i16o = dnnl_aCBd16c16b, + dnnl_gOIw16i16o = dnnl_aBCd16c16b, + dnnl_gOIw16o16i = dnnl_aBCd16b16c, + dnnl_gOiw16o = dnnl_aBcd16b, + dnnl_gOIw4i16o4i = dnnl_aBCd4c16b4c, + dnnl_gOIw2i8o4i = dnnl_aBCd2c8b4c, + dnnl_gOIw16i16o4i = dnnl_aBCd16c16b4c, + dnnl_gOIw16i16o2i = dnnl_aBCd16c16b2c, + dnnl_gOIw16o16i2o = dnnl_aBCd16b16c2b, + dnnl_gOIw4i4o = dnnl_aBCd4c4b, + dnnl_gOIw4o4i = dnnl_aBCd4b4c, + dnnl_gOiw4o = dnnl_aBcd4b, + dnnl_gOIw8i16o2i = dnnl_aBCd8c16b2c, + dnnl_gOIw8i8o = dnnl_aBCd8c8b, + dnnl_gOIw8o16i2o = dnnl_aBCd8b16c2b, + dnnl_gIOw8o16i2o = dnnl_aCBd8b16c2b, + dnnl_gOIw8o8i = dnnl_aBCd8b8c, + dnnl_gOIw8o4i = dnnl_aBCd8b4c, + dnnl_gOwi16o = dnnl_aBdc16b, + dnnl_gOwI16o2i = dnnl_aBdC16b2c, + dnnl_gOwI16o4i = dnnl_aBdC16b4c, + dnnl_gIwo8i = dnnl_aCdb8c, + dnnl_gIwO8i2o = dnnl_aCdB8c2b, + dnnl_gIwO8i4o = dnnl_aCdB8c4b, + dnnl_gIwo16i = dnnl_aCdb16c, + dnnl_gIwO16i2o = dnnl_aCdB16c2b, + dnnl_gIwO16i4o = dnnl_aCdB16c4b, + dnnl_gIwo24i = dnnl_aCdb24c, + dnnl_gIwO24i2o = dnnl_aCdB24c2b, + dnnl_gIwO24i4o = dnnl_aCdB24c4b, + dnnl_gOwi4o = dnnl_aBdc4b, + dnnl_gOwi8o = dnnl_aBdc8b, + dnnl_gOwI8o2i = dnnl_aBdC8b2c, + dnnl_gOwI8o4i = dnnl_aBdC8b4c, + dnnl_Goiw32g = dnnl_Abcd32a, + dnnl_gOIw2i4o2i = dnnl_aBCd2c4b2c, + dnnl_gOIw2o4i2o = dnnl_aBCd2b4c2b, + dnnl_gOIw4i8o2i = dnnl_aBCd4c8b2c, + dnnl_gOIw4o8i2o = dnnl_aBCd4b8c2b, + dnnl_goIw4i = dnnl_abCd4c, + dnnl_goIw32i = dnnl_abCd32c, + + // weights w/ groups, 4D + dnnl_gIOhw16i16o = dnnl_aCBde16c16b, + dnnl_gIOhw8o8i = dnnl_aCBde8b8c, + dnnl_gIOhw16o16i = dnnl_aCBde16b16c, + dnnl_gOhwi16o = dnnl_aBdec16b, + dnnl_gOhwI16o2i = dnnl_aBdeC16b2c, + dnnl_gOhwI16o4i = dnnl_aBdeC16b4c, + dnnl_gIhwo8i = dnnl_aCdeb8c, + dnnl_gIhwO8i2o = dnnl_aCdeB8c2b, + dnnl_gIhwO8i4o = dnnl_aCdeB8c4b, + dnnl_gIhwo16i = dnnl_aCdeb16c, + dnnl_gIhwO16i2o = dnnl_aCdeB16c2b, + dnnl_gIhwO16i4o = dnnl_aCdeB16c4b, + dnnl_gIhwo24i = dnnl_aCdeb24c, + dnnl_gIhwO24i2o = dnnl_aCdeB24c2b, + dnnl_gIhwO24i4o = dnnl_aCdeB24c4b, + dnnl_gOhwi32o = dnnl_aBdec32b, + dnnl_gOhwi24o = dnnl_aBdec24b, + dnnl_gOhwI24o2i = dnnl_aBdeC24b2c, + dnnl_gOhwI24o4i = dnnl_aBdeC24b4c, + dnnl_gOhwi4o = dnnl_aBdec4b, + dnnl_gOhwi8o = dnnl_aBdec8b, + dnnl_gOhwI8o2i = dnnl_aBdeC8b2c, + dnnl_gOhwI8o4i = dnnl_aBdeC8b4c, + dnnl_Goihw16g = dnnl_Abcde16a, + dnnl_gOIhw16i16o = dnnl_aBCde16c16b, + dnnl_gOIhw16o16i = dnnl_aBCde16b16c, + dnnl_gOihw16o = dnnl_aBcde16b, + dnnl_gOIhw2i8o4i = dnnl_aBCde2c8b4c, + dnnl_gOIhw4i16o4i = dnnl_aBCde4c16b4c, + dnnl_gOIhw16i16o4i = dnnl_aBCde16c16b4c, + dnnl_gOIhw16i16o2i = dnnl_aBCde16c16b2c, + dnnl_gOIhw16o16i2o = dnnl_aBCde16b16c2b, + dnnl_gOIhw4i4o = dnnl_aBCde4c4b, + dnnl_gOIhw4o4i = dnnl_aBCde4b4c, + dnnl_gOihw4o = dnnl_aBcde4b, + dnnl_Goihw8g = dnnl_Abcde8a, + dnnl_Goihw4g = dnnl_Abcde4a, + dnnl_gOIhw8i16o2i = dnnl_aBCde8c16b2c, + dnnl_gOIhw8i8o = dnnl_aBCde8c8b, + dnnl_gOIhw8o16i2o = dnnl_aBCde8b16c2b, + dnnl_gIOhw8o16i2o = dnnl_aCBde8b16c2b, + dnnl_gOIhw8o8i = dnnl_aBCde8b8c, + dnnl_gOIhw8o4i = dnnl_aBCde8b4c, + dnnl_Goihw32g = dnnl_Abcde32a, + dnnl_gOwhi16o = dnnl_aBedc16b, + dnnl_goIhw4i = dnnl_abCde4c, + dnnl_goIhw32i = dnnl_abCde32c, + + dnnl_OIw4o8i8o4i = dnnl_ABc4a8b8a4b, 
+ dnnl_OIhw4o8i8o4i = dnnl_ABcd4a8b8a4b, + dnnl_IOw4i8o8i4o = dnnl_BAc4b8a8b4a, + dnnl_IOhw4i8o8i4o = dnnl_BAcd4b8a8b4a, + dnnl_IOdhw4i8o8i4o = dnnl_BAcde4b8a8b4a, + + dnnl_OIhw2o8i8o2i = dnnl_ABcd2a8b8a2b, + dnnl_gOIw4o8i8o4i = dnnl_aBCd4b8c8b4c, + dnnl_gOIhw4o8i8o4i = dnnl_aBCde4b8c8b4c, + dnnl_gOIdhw4o8i8o4i = dnnl_aBCdef4b8c8b4c, + dnnl_gIOw4i8o8i4o = dnnl_aCBd4c8b8c4b, + dnnl_gIOhw4i8o8i4o = dnnl_aCBde4c8b8c4b, + dnnl_gIOdhw4i8o8i4o = dnnl_aCBdef4c8b8c4b, + dnnl_gOIhw2o8i8o2i = dnnl_aBCde2b8c8b2c, + dnnl_gOIhw2i4o2i = dnnl_aBCde2c4b2c, + dnnl_gOIhw2o4i2o = dnnl_aBCde2b4c2b, + dnnl_gOIhw4i8o2i = dnnl_aBCde4c8b2c, + dnnl_gOIhw4o8i2o = dnnl_aBCde4b8c2b, + + // weights w/ groups, 6D + dnnl_gIOdhw16i16o = dnnl_aCBdef16c16b, + dnnl_gIOdhw8o8i = dnnl_aCBdef8b8c, + dnnl_gIOdhw16o16i = dnnl_aCBdef16b16c, + dnnl_gOdhwi16o = dnnl_aBdefc16b, + dnnl_gOdhwI16o2i = dnnl_aBdefC16b2c, + dnnl_gOdhwI16o4i = dnnl_aBdefC16b4c, + dnnl_gIdhwo8i = dnnl_aCdefb8c, + dnnl_gIdhwO8i2o = dnnl_aCdefB8c2b, + dnnl_gIdhwO8i4o = dnnl_aCdefB8c4b, + dnnl_gIdhwo16i = dnnl_aCdefb16c, + dnnl_gIdhwO16i2o = dnnl_aCdefB16c2b, + dnnl_gIdhwO16i4o = dnnl_aCdefB16c4b, + dnnl_gIdhwo24i = dnnl_aCdefb24c, + dnnl_gIdhwO24i2o = dnnl_aCdefB24c2b, + dnnl_gIdhwO24i4o = dnnl_aCdefB24c4b, + dnnl_gOdhwi4o = dnnl_aBdefc4b, + dnnl_gOdhwi8o = dnnl_aBdefc8b, + dnnl_gOdhwI8o2i = dnnl_aBdefC8b2c, + dnnl_gOdhwI8o4i = dnnl_aBdefC8b4c, + dnnl_gOdwhi16o = dnnl_aBdfec16b, + dnnl_gOIdhw16i16o = dnnl_aBCdef16c16b, + dnnl_gOIdhw4i16o4i = dnnl_aBCdef4c16b4c, + dnnl_gOIdhw16i16o4i = dnnl_aBCdef16c16b4c, + dnnl_gOIdhw2i8o4i = dnnl_aBCdef2c8b4c, + dnnl_gOIdhw16i16o2i = dnnl_aBCdef16c16b2c, + dnnl_gOIdhw16o16i = dnnl_aBCdef16b16c, + dnnl_gOIdhw16o16i2o = dnnl_aBCdef16b16c2b, + dnnl_gOidhw16o = dnnl_aBcdef16b, + dnnl_gOIdhw4i4o = dnnl_aBCdef4c4b, + dnnl_gOIdhw4o4i = dnnl_aBCdef4b4c, + dnnl_gOidhw4o = dnnl_aBcdef4b, + dnnl_gOIdhw8i16o2i = dnnl_aBCdef8c16b2c, + dnnl_gOIdhw8i8o = dnnl_aBCdef8c8b, + dnnl_gOIdhw8o16i2o = dnnl_aBCdef8b16c2b, + dnnl_gIOdhw8o16i2o = dnnl_aCBdef8b16c2b, + dnnl_gOIdhw8o8i = dnnl_aBCdef8b8c, + dnnl_gOIdhw8o4i = dnnl_aBCdef8b4c, + dnnl_Goidhw16g = dnnl_Abcdef16a, + dnnl_Goidhw32g = dnnl_Abcdef32a, + dnnl_gOIdhw2i4o2i = dnnl_aBCdef2c4b2c, + dnnl_gOIdhw4i8o2i = dnnl_aBCdef4c8b2c, + dnnl_gOIdhw2o4i2o = dnnl_aBCdef2b4c2b, + dnnl_gOIdhw4o8i2o = dnnl_aBCdef4b8c2b, + dnnl_goIdhw4i = dnnl_abCdef4c, + dnnl_goIdhw32i = dnnl_abCdef32c, + + // weights, 3D + dnnl_Owi24o = dnnl_Acb24a, + dnnl_OwI24o2i = dnnl_AcB24a2b, + dnnl_OwI24o4i = dnnl_AcB24a4b, + dnnl_Owi32o = dnnl_Acb32a, + dnnl_OwI32o2i = dnnl_AcB32a2b, + dnnl_OwI32o4i = dnnl_AcB32a4b, + dnnl_Owi48o = dnnl_Acb48a, + dnnl_OwI48o2i = dnnl_AcB48a2b, + dnnl_OwI48o4i = dnnl_AcB48a4b, + dnnl_Owi64o = dnnl_Acb64a, + dnnl_OwI64o2i = dnnl_AcB64a2b, + dnnl_OwI64o4i = dnnl_AcB64a4b, + dnnl_Iwo32i = dnnl_Bca32b, + dnnl_IwO32i2o = dnnl_BcA32b2a, + dnnl_IwO32i4o = dnnl_BcA32b4a, + dnnl_Iwo48i = dnnl_Bca48b, + dnnl_IwO48i2o = dnnl_BcA48b2a, + dnnl_IwO48i4o = dnnl_BcA48b4a, + dnnl_Iwo64i = dnnl_Bca64b, + dnnl_IwO64i2o = dnnl_BcA64b2a, + dnnl_IwO64i4o = dnnl_BcA64b4a, + dnnl_wIo2i = dnnl_cBa2b, + dnnl_wIo4i = dnnl_cBa4b, + dnnl_gOwi24o = dnnl_aBdc24b, + dnnl_gOwI24o2i = dnnl_aBdC24b2c, + dnnl_gOwI24o4i = dnnl_aBdC24b4c, + dnnl_gOwi32o = dnnl_aBdc32b, + dnnl_gOwI32o2i = dnnl_aBdC32b2c, + dnnl_gOwI32o4i = dnnl_aBdC32b4c, + dnnl_gOwi48o = dnnl_aBdc48b, + dnnl_gOwI48o2i = dnnl_aBdC48b2c, + dnnl_gOwI48o4i = dnnl_aBdC48b4c, + dnnl_gOwi64o = dnnl_aBdc64b, + dnnl_gOwI64o2i = dnnl_aBdC64b2c, + dnnl_gOwI64o4i = 
dnnl_aBdC64b4c, + dnnl_gIwo32i = dnnl_aCdb32c, + dnnl_gIwO32i2o = dnnl_aCdB32c2b, + dnnl_gIwO32i4o = dnnl_aCdB32c4b, + dnnl_gIwo48i = dnnl_aCdb48c, + dnnl_gIwO48i2o = dnnl_aCdB48c2b, + dnnl_gIwO48i4o = dnnl_aCdB48c4b, + dnnl_gIwo64i = dnnl_aCdb64c, + dnnl_gIwO64i2o = dnnl_aCdB64c2b, + dnnl_gIwO64i4o = dnnl_aCdB64c4b, + dnnl_gwio = dnnl_adcb, + dnnl_gwIo2i = dnnl_adCb2c, + dnnl_gwIo4i = dnnl_adCb4c, + // weights, 4D + dnnl_OhwI24o = dnnl_Acdb24a, + dnnl_OhwI24o2i = dnnl_AcdB24a2b, + dnnl_OhwI24o4i = dnnl_AcdB24a4b, + dnnl_OhwI32o = dnnl_Acdb32a, + dnnl_OhwI32o2i = dnnl_AcdB32a2b, + dnnl_OhwI32o4i = dnnl_AcdB32a4b, + dnnl_Ohwi48o = dnnl_Acdb48a, + dnnl_OhwI48o2i = dnnl_AcdB48a2b, + dnnl_OhwI48o4i = dnnl_AcdB48a4b, + dnnl_Ohwi64o = dnnl_Acdb64a, + dnnl_OhwI64o2i = dnnl_AcdB64a2b, + dnnl_OhwI64o4i = dnnl_AcdB64a4b, + dnnl_Ihwo32i = dnnl_Bcda32b, + dnnl_IhwO32i2o = dnnl_BcdA32b2a, + dnnl_IhwO32i4o = dnnl_BcdA32b4a, + dnnl_Ihwo48i = dnnl_Bcda48b, + dnnl_IhwO48i2o = dnnl_BcdA48b2a, + dnnl_IhwO48i4o = dnnl_BcdA48b4a, + dnnl_Ihwo64i = dnnl_Bcda64b, + dnnl_IhwO64i2o = dnnl_BcdA64b2a, + dnnl_IhwO64i4o = dnnl_BcdA64b4a, + dnnl_hwIo2i = dnnl_cdBa2b, + dnnl_hwIo4i = dnnl_cdBa4b, + dnnl_gOhwI24o = dnnl_aBdec24b, + dnnl_gOhwI32o = dnnl_aBdec32b, + dnnl_gOhwI32o2i = dnnl_aBdeC32b2c, + dnnl_gOhwI32o4i = dnnl_aBdeC32b4c, + dnnl_gOhwi48o = dnnl_aBdec48b, + dnnl_gOhwI48o2i = dnnl_aBdeC48b2c, + dnnl_gOhwI48o4i = dnnl_aBdeC48b4c, + dnnl_gOhwi64o = dnnl_aBdec64b, + dnnl_gOhwI64o2i = dnnl_aBdeC64b2c, + dnnl_gOhwI64o4i = dnnl_aBdeC64b4c, + dnnl_gIhwo32i = dnnl_aCdeb32c, + dnnl_gIhwO32i2o = dnnl_aCdeB32c2b, + dnnl_gIhwO32i4o = dnnl_aCdeB32c4b, + dnnl_gIhwo48i = dnnl_aCdeb48c, + dnnl_gIhwO48i2o = dnnl_aCdeB48c2b, + dnnl_gIhwO48i4o = dnnl_aCdeB48c4b, + dnnl_gIhwo64i = dnnl_aCdeb64c, + dnnl_gIhwO64i2o = dnnl_aCdeB64c2b, + dnnl_gIhwO64i4o = dnnl_aCdeB64c4b, + dnnl_ghwio = dnnl_adecb, + dnnl_ghwIo2i = dnnl_adeCb2c, + dnnl_ghwIo4i = dnnl_adeCb4c, + // weights, 5D + dnnl_Odhwi24o = dnnl_Acdeb24a, + dnnl_OdhwI24o2i = dnnl_AcdeB24a2b, + dnnl_OdhwI24o4i = dnnl_AcdeB24a4b, + dnnl_Odhwi32o = dnnl_Acdeb32a, + dnnl_OdhwI32o2i = dnnl_AcdeB32a2b, + dnnl_OdhwI32o4i = dnnl_AcdeB32a4b, + dnnl_Odhwi48o = dnnl_Acdeb48a, + dnnl_OdhwI48o2i = dnnl_AcdeB48a2b, + dnnl_OdhwI48o4i = dnnl_AcdeB48a4b, + dnnl_Odhwi64o = dnnl_Acdeb64a, + dnnl_OdhwI64o2i = dnnl_AcdeB64a2b, + dnnl_OdhwI64o4i = dnnl_AcdeB64a4b, + dnnl_Idhwo32i = dnnl_Bcdea32b, + dnnl_IdhwO32i2o = dnnl_BcdeA32b2a, + dnnl_IdhwO32i4o = dnnl_BcdeA32b4a, + dnnl_Idhwo48i = dnnl_Bcdea48b, + dnnl_IdhwO48i2o = dnnl_BcdeA48b2a, + dnnl_IdhwO48i4o = dnnl_BcdeA48b4a, + dnnl_Idhwo64i = dnnl_Bcdea64b, + dnnl_IdhwO64i2o = dnnl_BcdeA64b2a, + dnnl_IdhwO64i4o = dnnl_BcdeA64b4a, + dnnl_dhwIo2i = dnnl_cdeBa2b, + dnnl_dhwIo4i = dnnl_cdeBa4b, + dnnl_gOdhwi24o = dnnl_aBdefc24b, + dnnl_gOdhwI24o2i = dnnl_aBdefC24b2c, + dnnl_gOdhwI24o4i = dnnl_aBdefC24b4c, + dnnl_gOdhwi32o = dnnl_aBdefc32b, + dnnl_gOdhwI32o2i = dnnl_aBdefC32b2c, + dnnl_gOdhwI32o4i = dnnl_aBdefC32b4c, + dnnl_gOdhwi48o = dnnl_aBdefc48b, + dnnl_gOdhwI48o2i = dnnl_aBdefC48b2c, + dnnl_gOdhwI48o4i = dnnl_aBdefC48b4c, + dnnl_gOdhwi64o = dnnl_aBdefc64b, + dnnl_gOdhwI64o2i = dnnl_aBdefC64b2c, + dnnl_gOdhwI64o4i = dnnl_aBdefC64b4c, + dnnl_gIdhwo32i = dnnl_aCdefb32c, + dnnl_gIdhwO32i2o = dnnl_aCdefB32c2b, + dnnl_gIdhwO32i4o = dnnl_aCdefB32c4b, + dnnl_gIdhwo48i = dnnl_aCdefb48c, + dnnl_gIdhwO48i2o = dnnl_aCdefB48c2b, + dnnl_gIdhwO48i4o = dnnl_aCdefB48c4b, + dnnl_gIdhwo64i = dnnl_aCdefb64c, + dnnl_gIdhwO64i2o = dnnl_aCdefB64c2b, + dnnl_gIdhwO64i4o = 
dnnl_aCdefB64c4b, + dnnl_gdhwio = dnnl_adefcb, + dnnl_gdhwIo2i = dnnl_adefCb2c, + dnnl_gdhwIo4i = dnnl_adefCb4c, + dnnl_OI16i32o4i = dnnl_AB16b32a4b, + dnnl_OI16i48o4i = dnnl_AB16b48a4b, + dnnl_OI16i64o4i = dnnl_AB16b64a4b, + dnnl_OI16i16o2i = dnnl_AB16b16a2b, + dnnl_OI16i32o2i = dnnl_AB16b32a2b, + dnnl_OI16i48o2i = dnnl_AB16b48a2b, + dnnl_OI16i64o2i = dnnl_AB16b64a2b, + dnnl_OIw16i32o4i = dnnl_ABc16b32a4b, + dnnl_OIw16i48o4i = dnnl_ABc16b48a4b, + dnnl_OIw16i64o4i = dnnl_ABc16b64a4b, + dnnl_OIw16i32o2i = dnnl_ABc16b32a2b, + dnnl_OIw16i48o2i = dnnl_ABc16b48a2b, + dnnl_OIw16i64o2i = dnnl_ABc16b64a2b, + dnnl_OIhw16i32o4i = dnnl_ABcd16b32a4b, + dnnl_OIhw16i48o4i = dnnl_ABcd16b48a4b, + dnnl_OIhw16i64o4i = dnnl_ABcd16b64a4b, + dnnl_OIhw16i32o2i = dnnl_ABcd16b32a2b, + dnnl_OIhw16i48o2i = dnnl_ABcd16b48a2b, + dnnl_OIhw16i64o2i = dnnl_ABcd16b64a2b, + dnnl_OIdhw16i32o4i = dnnl_ABcde16b32a4b, + dnnl_OIdhw16i48o4i = dnnl_ABcde16b48a4b, + dnnl_OIdhw16i64o4i = dnnl_ABcde16b64a4b, + dnnl_OIdhw16i32o2i = dnnl_ABcde16b32a2b, + dnnl_OIdhw16i48o2i = dnnl_ABcde16b48a2b, + dnnl_OIdhw16i64o2i = dnnl_ABcde16b64a2b, + dnnl_OwI16i16o2i = dnnl_AcB16b16a2b, + dnnl_OwI16i16o4i = dnnl_AcB16b16a4b, + dnnl_OhwI16i16o2i = dnnl_AcdB16b16a2b, + dnnl_OhwI16i16o4i = dnnl_AcdB16b16a4b, + dnnl_OdhwI16i16o2i = dnnl_AcdeB16b16a2b, + dnnl_OdhwI16i16o4i = dnnl_AcdeB16b16a4b, + dnnl_IwO16o16i2o = dnnl_BcA16a16b2a, + dnnl_IwO16o16i4o = dnnl_BcA16a16b4a, + dnnl_IhwO16o16i2o = dnnl_BcdA16a16b2a, + dnnl_IhwO16o16i4o = dnnl_BcdA16a16b4a, + dnnl_IdhwO16o16i2o = dnnl_BcdeA16a16b2a, + dnnl_IdhwO16o16i4o = dnnl_BcdeA16a16b4a, + dnnl_gOwI16i16o2i = dnnl_aBdC16c16b2c, + dnnl_gOwI16i16o4i = dnnl_aBdC16c16b4c, + dnnl_gOhwI16i16o2i = dnnl_aBdeC16c16b2c, + dnnl_gOhwI16i16o4i = dnnl_aBdeC16c16b4c, + dnnl_gOdhwI16i16o2i = dnnl_aBdefC16c16b2c, + dnnl_gOdhwI16i16o4i = dnnl_aBdefC16c16b4c, + dnnl_gIwO16o16i2o = dnnl_aCdB16b16c2b, + dnnl_gIwO16o16i4o = dnnl_aCdB16b16c4b, + dnnl_gIhwO16o16i2o = dnnl_aCdeB16b16c2b, + dnnl_gIhwO16o16i4o = dnnl_aCdeB16b16c4b, + dnnl_gIdhwO16o16i2o = dnnl_aCdefB16b16c2b, + dnnl_gIdhwO16o16i4o = dnnl_aCdefB16b16c4b, + dnnl_OwI16i32o2i = dnnl_AcB16b32a2b, + dnnl_OwI16i32o4i = dnnl_AcB16b32a4b, + dnnl_OwI16i48o2i = dnnl_AcB16b48a2b, + dnnl_OwI16i48o4i = dnnl_AcB16b48a4b, + dnnl_OwI16i64o2i = dnnl_AcB16b64a2b, + dnnl_OwI16i64o4i = dnnl_AcB16b64a4b, + dnnl_IwO16o32i2o = dnnl_BcA16a32b2a, + dnnl_IwO16o32i4o = dnnl_BcA16a32b4a, + dnnl_IwO16o48i2o = dnnl_BcA16a48b2a, + dnnl_IwO16o48i4o = dnnl_BcA16a48b4a, + dnnl_IwO16o64i2o = dnnl_BcA16a64b2a, + dnnl_IwO16o64i4o = dnnl_BcA16a64b4a, + dnnl_gOwI16i32o2i = dnnl_aBdC16c32b2c, + dnnl_gOwI16i32o4i = dnnl_aBdC16c32b4c, + dnnl_gOwI16i48o2i = dnnl_aBdC16c48b2c, + dnnl_gOwI16i48o4i = dnnl_aBdC16c48b4c, + dnnl_gOwI16i64o2i = dnnl_aBdC16c64b2c, + dnnl_gOwI16i64o4i = dnnl_aBdC16c64b4c, + dnnl_gIwO16o32i2o = dnnl_aCdB16b32c2b, + dnnl_gIwO16o32i4o = dnnl_aCdB16b32c4b, + dnnl_gIwO16o48i2o = dnnl_aCdB16b48c2b, + dnnl_gIwO16o48i4o = dnnl_aCdB16b48c4b, + dnnl_gIwO16o64i2o = dnnl_aCdB16b64c2b, + dnnl_gIwO16o64i4o = dnnl_aCdB16b64c4b, + dnnl_OhwI16i32o2i = dnnl_AcdB16b32a2b, + dnnl_OhwI16i32o4i = dnnl_AcdB16b32a4b, + dnnl_OhwI16i48o2i = dnnl_AcdB16b48a2b, + dnnl_OhwI16i48o4i = dnnl_AcdB16b48a4b, + dnnl_OhwI16i64o2i = dnnl_AcdB16b64a2b, + dnnl_OhwI16i64o4i = dnnl_AcdB16b64a4b, + dnnl_IhwO16o32i2o = dnnl_BcdA16a32b2a, + dnnl_IhwO16o32i4o = dnnl_BcdA16a32b4a, + dnnl_IhwO16o48i2o = dnnl_BcdA16a48b2a, + dnnl_IhwO16o48i4o = dnnl_BcdA16a48b4a, + dnnl_IhwO16o64i2o = dnnl_BcdA16a64b2a, + dnnl_IhwO16o64i4o = 
dnnl_BcdA16a64b4a, + dnnl_gOhwI16i32o2i = dnnl_aBdeC16c32b2c, + dnnl_gOhwI16i32o4i = dnnl_aBdeC16c32b4c, + dnnl_gOhwI16i48o2i = dnnl_aBdeC16c48b2c, + dnnl_gOhwI16i48o4i = dnnl_aBdeC16c48b4c, + dnnl_gOhwI16i64o2i = dnnl_aBdeC16c64b2c, + dnnl_gOhwI16i64o4i = dnnl_aBdeC16c64b4c, + dnnl_gIhwO16o32i2o = dnnl_aCdeB16b32c2b, + dnnl_gIhwO16o32i4o = dnnl_aCdeB16b32c4b, + dnnl_gIhwO16o48i2o = dnnl_aCdeB16b48c2b, + dnnl_gIhwO16o48i4o = dnnl_aCdeB16b48c4b, + dnnl_gIhwO16o64i2o = dnnl_aCdeB16b64c2b, + dnnl_gIhwO16o64i4o = dnnl_aCdeB16b64c4b, + dnnl_OdhwI16i32o2i = dnnl_AcdeB16b32a2b, + dnnl_OdhwI16i32o4i = dnnl_AcdeB16b32a4b, + dnnl_OdhwI16i48o2i = dnnl_AcdeB16b48a2b, + dnnl_OdhwI16i48o4i = dnnl_AcdeB16b48a4b, + dnnl_OdhwI16i64o2i = dnnl_AcdeB16b64a2b, + dnnl_OdhwI16i64o4i = dnnl_AcdeB16b64a4b, + dnnl_IdhwO16o32i2o = dnnl_BcdeA16a32b2a, + dnnl_IdhwO16o32i4o = dnnl_BcdeA16a32b4a, + dnnl_IdhwO16o48i2o = dnnl_BcdeA16a48b2a, + dnnl_IdhwO16o48i4o = dnnl_BcdeA16a48b4a, + dnnl_IdhwO16o64i2o = dnnl_BcdeA16a64b2a, + dnnl_IdhwO16o64i4o = dnnl_BcdeA16a64b4a, + dnnl_gOdhwI16i32o2i = dnnl_aBdefC16c32b2c, + dnnl_gOdhwI16i32o4i = dnnl_aBdefC16c32b4c, + dnnl_gOdhwI16i48o2i = dnnl_aBdefC16c48b2c, + dnnl_gOdhwI16i48o4i = dnnl_aBdefC16c48b4c, + dnnl_gOdhwI16i64o2i = dnnl_aBdefC16c64b2c, + dnnl_gOdhwI16i64o4i = dnnl_aBdefC16c64b4c, + dnnl_gIdhwO16o32i2o = dnnl_aCdefB16b32c2b, + dnnl_gIdhwO16o32i4o = dnnl_aCdefB16b32c4b, + dnnl_gIdhwO16o48i2o = dnnl_aCdefB16b48c2b, + dnnl_gIdhwO16o48i4o = dnnl_aCdefB16b48c4b, + dnnl_gIdhwO16o64i2o = dnnl_aCdefB16b64c2b, + dnnl_gIdhwO16o64i4o = dnnl_aCdefB16b64c4b, + dnnl_hwioG16g = dnnl_decbA16a, + dnnl_hwioG8g = dnnl_decbA8a, + dnnl_dhwioG16g = dnnl_defcbA16a, + dnnl_dhwioG8g = dnnl_defcbA8a, + dnnl_NCdhw40n16c = dnnl_ABcde40a16b, + dnnl_NCw40n16c = dnnl_ABc40a16b, + dnnl_NChw40n16c = dnnl_ABcd40a16b, + dnnl_NCw40n32c = dnnl_ABc40a32b, + dnnl_NChw40n32c = dnnl_ABcd40a32b, + dnnl_NCdhw40n32c = dnnl_ABcde40a32b, + dnnl_OIdhw4o8i8o2i = dnnl_ABcde4a8b8a2b, + dnnl_OIhw4o8i8o2i = dnnl_ABcd4a8b8a2b, + dnnl_OIw4o8i8o2i = dnnl_ABc4a8b8a2b, + dnnl_gOIdhw4o8i8o2i = dnnl_aBCdef4b8c8b2c, + dnnl_gOIhw4o8i8o2i = dnnl_aBCde4b8c8b2c, + dnnl_gOIw4o8i8o2i = dnnl_aBCd4b8c8b2c, + dnnl_IOdhw4i8o8i2o = dnnl_BAcde4b8a8b2a, + dnnl_IOhw4i8o8i2o = dnnl_BAcd4b8a8b2a, + dnnl_IOw4i8o8i2o = dnnl_BAc4b8a8b2a, + dnnl_gIOdhw4i8o8i2o = dnnl_aCBdef4c8b8c2b, + dnnl_gIOhw4i8o8i2o = dnnl_aCBde4c8b8c2b, + dnnl_gIOw4i8o8i2o = dnnl_aCBd4c8b8c2b, + dnnl_NCw2c32n8c = dnnl_ABc2b32a8b, + dnnl_NChw2c32n8c = dnnl_ABcd2b32a8b, + dnnl_NCdhw2c32n8c = dnnl_ABcde2b32a8b, + dnnl_OIw2i8o16i4o = dnnl_ABc2b8a16b4a, + dnnl_OIhw2i8o16i4o = dnnl_ABcd2b8a16b4a, + dnnl_OIdhw2i8o16i4o = dnnl_ABcde2b8a16b4a, + dnnl_OIw2o8i16o4i = dnnl_ABc2a8b16a4b, + dnnl_OIw2o8i16o2i = dnnl_ABc2a8b16a2b, + dnnl_IOw2i8o16i4o = dnnl_BAc2b8a16b4a, + dnnl_IOw2i8o16i2o = dnnl_BAc2b8a16b2a, + dnnl_OIhw2o8i16o4i = dnnl_ABcd2a8b16a4b, + dnnl_OIhw2o8i16o2i = dnnl_ABcd2a8b16a2b, + dnnl_IOhw2i8o16i4o = dnnl_BAcd2b8a16b4a, + dnnl_IOhw2i8o16i2o = dnnl_BAcd2b8a16b2a, + dnnl_OIdhw2o8i16o4i = dnnl_ABcde2a8b16a4b, + dnnl_OIdhw2o8i16o2i = dnnl_ABcde2a8b16a2b, + dnnl_IOdhw2i8o16i4o = dnnl_BAcde2b8a16b4a, + dnnl_IOdhw2i8o16i2o = dnnl_BAcde2b8a16b2a, + dnnl_gOIw2o8i16o2i = dnnl_aBCd2b8c16b2c, + dnnl_gIOw2i8o16i2o = dnnl_aCBd2c8b16c2b, + dnnl_gIOhw2i8o16i2o = dnnl_aBCde2c8b16c2b, + dnnl_gIOdhw2i8o16i2o = dnnl_aBCdef2c8b16c2b, + dnnl_gOIhw2o8i16o2i = dnnl_aBCde2b8c16b2c, + dnnl_gOIdhw2o8i16o2i = dnnl_aBCdef2b8c16b2c, + dnnl_gOIw2o8i16o4i = dnnl_aBCd2b8c16b4c, + dnnl_gOIhw2o8i16o4i = dnnl_aBCde2b8c16b4c, +} dnnl_format_tag_t;
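The enum above closes with hundreds of blocked layouts, but in user code a format tag is normally just passed to a memory-descriptor constructor. A minimal sketch (an editorial illustration, not part of the header; it assumes the oneDNN 3.x C API) showing that an alias such as dnnl_nhwc and the plain letter tag it maps to, dnnl_acdb, describe the same physical layout:

```c
#include <assert.h>
#include "oneapi/dnnl/dnnl.h"

void format_tag_alias_demo(void) {
    dnnl_dims_t dims = {2, 3, 227, 227}; // N, C, H, W
    dnnl_memory_desc_t md_nhwc, md_acdb;
    // Create two f32 descriptors for the same logical tensor, once via the
    // CNN-style alias and once via the generic letter tag it expands to.
    dnnl_memory_desc_create_with_tag(&md_nhwc, 4, dims, dnnl_f32, dnnl_nhwc);
    dnnl_memory_desc_create_with_tag(&md_acdb, 4, dims, dnnl_f32, dnnl_acdb);
    assert(dnnl_memory_desc_equal(md_nhwc, md_acdb)); // identical layouts
    dnnl_memory_desc_destroy(md_nhwc);
    dnnl_memory_desc_destroy(md_acdb);
}
```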
+ +/// @} dnnl_api_memory + +/// @addtogroup dnnl_api_primitives +/// @{ +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// Kinds of propagation. +typedef enum { + // TODO: suggest renames + /// Undefined propagation type. + dnnl_prop_kind_undef = 0, + /// Forward data propagation (training mode). In this mode primitives + /// perform computations necessary for subsequent backward propagation. + dnnl_forward_training = 64, + /// Forward data propagation (inference mode). In this mode primitives + /// perform only computations that are necessary for inference and omit + /// computations that are necessary only for backward propagation. + dnnl_forward_inference = 96, + /// Forward data propagation (alias for @c dnnl_forward_training). + dnnl_forward = dnnl_forward_training, + /// Backward propagation (with respect to all parameters). + dnnl_backward = 128, + /// Backward data propagation. + dnnl_backward_data = 160, + /// Backward weights propagation. + dnnl_backward_weights = 192, + /// Backward bias propagation. + dnnl_backward_bias = 193, +} dnnl_prop_kind_t; + +/// Kinds of primitives. Used to implement a way to extend the library with new +/// primitives without changing the ABI. +typedef enum { + /// Undefined primitive + dnnl_undefined_primitive, + /// A reorder primitive. + dnnl_reorder, + /// A shuffle primitive. + dnnl_shuffle, + /// An (out-of-place) concat primitive. + dnnl_concat, + /// A sum primitive. + dnnl_sum, + /// A convolution primitive. + dnnl_convolution, + /// A deconvolution primitive. + dnnl_deconvolution, + /// An element-wise primitive. + dnnl_eltwise, + /// An LRN primitive. + dnnl_lrn, + /// A batch normalization primitive. + dnnl_batch_normalization, + /// An inner product primitive. + dnnl_inner_product, + /// An RNN primitive. + dnnl_rnn, + /// A matrix multiplication primitive (internal). + dnnl_gemm, + /// A binary primitive. + dnnl_binary, + /// A matrix multiplication primitive. + dnnl_matmul, + /// A resampling primitive. + dnnl_resampling, + /// A pooling primitive. + dnnl_pooling, + /// A reduction primitive. + dnnl_reduction, + /// A PReLU primitive. + dnnl_prelu, + /// A softmax primitive. + dnnl_softmax, + /// A layer normalization primitive. + dnnl_layer_normalization, + /// A group normalization primitive. + dnnl_group_normalization, + + /// Parameter to allow internal-only primitives without undefined behavior. + /// This parameter is chosen to be valid as long as sizeof(int) >= 2. + dnnl_primitive_kind_max = 0x7fff, +} dnnl_primitive_kind_t;
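The propagation kinds above determine which extra outputs a primitive produces. A small hedged sketch (a hypothetical helper, not part of the header) of the usual selection logic:

```c
#include "oneapi/dnnl/dnnl_types.h"

/* Hypothetical helper: dnnl_forward is just an alias for
 * dnnl_forward_training, so the real choice is training vs. inference. */
static dnnl_prop_kind_t pick_prop_kind(int will_run_backward) {
    // Training mode keeps extra outputs (e.g. workspace) that the matching
    // backward primitives need; inference mode omits them.
    return will_run_backward ? dnnl_forward_training : dnnl_forward_inference;
}
```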
+ +/// Kinds of algorithms. +typedef enum { + dnnl_alg_kind_undef, + /// Direct convolution + dnnl_convolution_direct = 0x1, + /// Winograd convolution + dnnl_convolution_winograd = 0x2, + /// Convolution algorithm (either direct or Winograd) is chosen just in time + dnnl_convolution_auto = 0x3, + /// Direct deconvolution + dnnl_deconvolution_direct = 0xa, + /// Winograd deconvolution + dnnl_deconvolution_winograd = 0xb, + /// Eltwise: ReLU + dnnl_eltwise_relu = 0x20, + /// Eltwise: hyperbolic tangent non-linearity (tanh) + dnnl_eltwise_tanh, + /// Eltwise: exponential linear unit (elu) + dnnl_eltwise_elu, + /// Eltwise: square + dnnl_eltwise_square, + /// Eltwise: abs + dnnl_eltwise_abs, + /// Eltwise: square root + dnnl_eltwise_sqrt, + /// Eltwise: linear + dnnl_eltwise_linear, + /// Eltwise: soft_relu + dnnl_eltwise_soft_relu, + /// Eltwise: hardsigmoid + dnnl_eltwise_hardsigmoid, + /// Eltwise: logistic + dnnl_eltwise_logistic, + /// Eltwise: exponent + dnnl_eltwise_exp, + /// Eltwise: gelu + /// + /// @note Tanh approximation formula is used to approximate + /// the cumulative distribution function of a Gaussian here + dnnl_eltwise_gelu_tanh, + /// Eltwise: swish + dnnl_eltwise_swish, + /// Eltwise: natural logarithm + dnnl_eltwise_log, + /// Eltwise: clip + dnnl_eltwise_clip, + /// Eltwise: clip version 2 + dnnl_eltwise_clip_v2, + /// Eltwise: pow + dnnl_eltwise_pow, + /// Eltwise: erf-based gelu + dnnl_eltwise_gelu_erf, + /// Eltwise: round + dnnl_eltwise_round, + /// Eltwise: mish + dnnl_eltwise_mish, + /// Eltwise: hardswish + dnnl_eltwise_hardswish, + /// Eltwise: ReLU (dst for backward) + dnnl_eltwise_relu_use_dst_for_bwd = 0x100, + /// Eltwise: hyperbolic tangent non-linearity (tanh) (dst for backward) + dnnl_eltwise_tanh_use_dst_for_bwd, + /// Eltwise: exponential linear unit (elu) (dst for backward) + dnnl_eltwise_elu_use_dst_for_bwd, + /// Eltwise: square root (dst for backward) + dnnl_eltwise_sqrt_use_dst_for_bwd, + /// Eltwise: logistic (dst for backward) + dnnl_eltwise_logistic_use_dst_for_bwd, + /// Eltwise: exp (dst for backward) + dnnl_eltwise_exp_use_dst_for_bwd, + /// Eltwise: clip version 2 (dst for backward) + dnnl_eltwise_clip_v2_use_dst_for_bwd, + /// Max pooling + dnnl_pooling_max = 0x1ff, + /// Average pooling include padding + dnnl_pooling_avg_include_padding = 0x2ff, + /// Average pooling exclude padding + dnnl_pooling_avg_exclude_padding = 0x3ff, + /// Local response normalization (LRN) across multiple channels + dnnl_lrn_across_channels = 0xaff, + /// LRN within a single channel + dnnl_lrn_within_channel = 0xbff, + /// RNN cell + dnnl_vanilla_rnn = 0x1fff, + /// LSTM cell + dnnl_vanilla_lstm = 0x2fff, + /// GRU cell + dnnl_vanilla_gru = 0x3fff, + /// GRU cell with linear before reset + /// + /// Modification of the original GRU cell.
Differs from #dnnl_vanilla_gru + /// in how the new memory gate is calculated: + /// \f[ c_t = tanh(W_c*x_t + b_{c_x} + r_t*(U_c*h_{t-1}+b_{c_h})) \f] + /// The primitive expects 4 biases on input: + /// \f$[b_{u}, b_{r}, b_{c_x}, b_{c_h}]\f$ + dnnl_lbr_gru = 0x4fff, + /// AUGRU cell + dnnl_vanilla_augru = 0x5fff, + /// AUGRU cell with linear before reset + dnnl_lbr_augru = 0x6fff, + /// Binary add + dnnl_binary_add = 0x1fff0, + /// Binary mul + dnnl_binary_mul = 0x1fff1, + /// Binary max + dnnl_binary_max = 0x1fff2, + /// Binary min + dnnl_binary_min = 0x1fff3, + /// Binary div + dnnl_binary_div = 0x1fff4, + /// Binary sub + dnnl_binary_sub = 0x1fff5, + /// Binary greater or equal + dnnl_binary_ge = 0x1fff6, + /// Binary greater than + dnnl_binary_gt = 0x1fff7, + /// Binary less or equal + dnnl_binary_le = 0x1fff8, + /// Binary less than + dnnl_binary_lt = 0x1fff9, + /// Binary equal + dnnl_binary_eq = 0x1fffa, + /// Binary not equal + dnnl_binary_ne = 0x1fffb, + /// Binary select + dnnl_binary_select = 0x1fffc, + /// Nearest Neighbor Resampling Method + dnnl_resampling_nearest = 0x2fff0, + /// Linear Resampling Method + dnnl_resampling_linear = 0x2fff1, + /// Reduction using max + dnnl_reduction_max, + /// Reduction using min + dnnl_reduction_min, + /// Reduction using sum + dnnl_reduction_sum, + /// Reduction using mul + dnnl_reduction_mul, + /// Reduction using mean + dnnl_reduction_mean, + /// Reduction using lp norm + dnnl_reduction_norm_lp_max, + /// Reduction using lp norm + dnnl_reduction_norm_lp_sum, + /// Reduction using lp norm without final pth-root + dnnl_reduction_norm_lp_power_p_max, + /// Reduction using lp norm without final pth-root + dnnl_reduction_norm_lp_power_p_sum, + /// Softmax + dnnl_softmax_accurate = 0x30000, + /// Logsoftmax + dnnl_softmax_log, +} dnnl_alg_kind_t;
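To make the algorithm kinds concrete, here is a short sketch (editorial; it assumes the oneDNN 3.x C API signature of dnnl_eltwise_forward_primitive_desc_create, plus a valid engine and memory descriptor) that instantiates a ReLU eltwise primitive descriptor:

```c
#include "oneapi/dnnl/dnnl.h"

dnnl_status_t make_relu_pd(dnnl_engine_t engine, const_dnnl_memory_desc_t md,
        dnnl_primitive_desc_t *pd) {
    // For dnnl_eltwise_relu, alpha is the negative slope (0.f gives plain
    // ReLU) and beta is unused; NULL requests default attributes.
    return dnnl_eltwise_forward_primitive_desc_create(pd, engine,
            dnnl_forward_inference, dnnl_eltwise_relu, md, md, 0.f, 0.f, NULL);
}
```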
+ +/// Flags for normalization primitives. +typedef enum { + /// Use no normalization flags + /// + /// If specified + /// - on forward training propagation mean and variance are computed and + /// stored as output + /// - on backward propagation compute full derivative wrt data + /// - on backward propagation prop_kind == #dnnl_backward_data has the same + /// behavior as prop_kind == #dnnl_backward + dnnl_normalization_flags_none = 0x0U, + + /// Use global statistics + /// + /// If specified + /// - on forward propagation use mean and variance provided by user (input) + /// - on backward propagation reduces the amount of computations, since + /// mean and variance are considered as constants + /// + /// If not specified: + /// - on forward propagation mean and variance are computed and stored as + /// output + /// - on backward propagation compute full derivative wrt data + dnnl_use_global_stats = 0x1U, + + /// Use scale parameter + /// + /// If specified: + /// - on forward propagation use scale for the normalization results + /// - on backward propagation (for prop_kind == #dnnl_backward) compute + /// diff wrt scale (hence one extra output used) + dnnl_use_scale = 0x2U, + + /// Use shift parameter + /// + /// If specified: + /// - on forward propagation use shift (aka bias) for the normalization + /// results + /// - on backward propagation (for prop_kind == #dnnl_backward) compute + /// diff wrt shift (hence one extra output used) + dnnl_use_shift = 0x4U, + + /// Fuse with ReLU + /// + /// The flag implies negative slope being 0. On training this is the only + /// configuration supported. For inference, to use non-zero negative slope + /// consider using @ref dev_guide_attributes_post_ops. + /// + /// If specified: + /// - on inference this option behaves the same as if the primitive were + /// fused with ReLU using post ops API with zero negative slope. + /// - on training primitive requires workspace (required to be able to + /// perform backward pass) + dnnl_fuse_norm_relu = 0x8U, + + /// Fuse with Add and then fuse with ReLU + /// + /// If specified: + /// + /// - on forward propagation apply an element-wise binary Add operation + /// to the normalization results with an additional input tensor and then + /// apply ReLU with negative slope being 0. + /// - on training primitive requires workspace (required to be able to + /// perform backward pass). + /// - on backward propagation save the result of backward ReLU operation + /// with input tensor and workspace from forward pass to extra output + /// tensor and then perform backward normalization. + dnnl_fuse_norm_add_relu = 0x10U, + +} dnnl_normalization_flags_t; + +/// @} dnnl_api_primitives_common +/// @} dnnl_api_primitives + +/// @addtogroup dnnl_api_memory +/// @{ + +/// A wildcard value for dimensions that are unknown at primitive creation +/// time. +#define DNNL_RUNTIME_DIM_VAL INT64_MIN + +/// A `size_t` counterpart of #DNNL_RUNTIME_DIM_VAL. +/// For instance, this value is returned by dnnl_memory_desc_get_size() if +/// any of the dimensions or strides equals #DNNL_RUNTIME_DIM_VAL. +#define DNNL_RUNTIME_SIZE_VAL ((size_t)DNNL_RUNTIME_DIM_VAL) + +/// @cond DO_NOT_DOCUMENT_THIS +/// Hex representation for a **special** quiet NAN (!= NAN from math.h) +static const union { + unsigned u; + float f; +} DNNL_RUNTIME_F32_VAL_REP = {0x7fc000d0}; +/// @endcond + +/// A wildcard value for floating point values that are unknown at primitive +/// creation time. +#define DNNL_RUNTIME_F32_VAL (DNNL_RUNTIME_F32_VAL_REP.f) + +/// @cond DO_NOT_DOCUMENT_THIS +static const int DNNL_RUNTIME_S32_VAL_REP = INT32_MIN; +/// @endcond + +/// A wildcard value for int32_t values that are unknown at primitive creation +/// time. +#define DNNL_RUNTIME_S32_VAL DNNL_RUNTIME_S32_VAL_REP + +/// @struct dnnl_memory_desc +/// An opaque structure to describe a memory descriptor. +struct dnnl_memory_desc; + +/// A memory descriptor handle. +typedef struct dnnl_memory_desc *dnnl_memory_desc_t; + +/// A constant memory descriptor handle. +typedef const struct dnnl_memory_desc *const_dnnl_memory_desc_t; + +/// @struct dnnl_memory +/// An opaque structure to describe a memory. +struct dnnl_memory; + +/// A memory handle. +typedef struct dnnl_memory *dnnl_memory_t; + +/// A constant memory handle. +typedef const struct dnnl_memory *const_dnnl_memory_t; + +/// @} dnnl_api_memory
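The wildcard values defined above are easiest to see in code. A minimal sketch (editorial, assuming the oneDNN 3.x C API): a 2D descriptor whose leading dimension is fixed only at execution time, which is exactly the case in which dnnl_memory_desc_get_size() reports #DNNL_RUNTIME_SIZE_VAL:

```c
#include "oneapi/dnnl/dnnl.h"

void runtime_dim_demo(void) {
    // The row count is unknown until execution; the column count is fixed.
    dnnl_dims_t dims = {DNNL_RUNTIME_DIM_VAL, 256};
    dnnl_memory_desc_t md;
    dnnl_memory_desc_create_with_tag(&md, 2, dims, dnnl_f32, dnnl_ab);
    // With a wildcard dimension the byte size is a wildcard as well.
    size_t sz = dnnl_memory_desc_get_size(md); // == DNNL_RUNTIME_SIZE_VAL
    (void)sz;
    dnnl_memory_desc_destroy(md);
}
```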
+ +/// @addtogroup dnnl_api_primitives +/// @{ + +/// @addtogroup dnnl_api_rnn +/// @{ + +/// Flags for RNN cell. +typedef enum { + /// Undefined RNN flags + dnnl_rnn_flags_undef = 0x0, + /// Do not add weights gradient to existing diff_weights memory + dnnl_rnn_flags_diff_weights_overwrite = 0x1, +} dnnl_rnn_flags_t; + +/// A direction of RNN primitive execution. +typedef enum { + /// Undefined RNN direction. + dnnl_rnn_direction_undef = 0, + /// Unidirectional execution of RNN primitive from left to right. + dnnl_unidirectional_left2right, + /// Unidirectional execution of RNN primitive from right to left. + dnnl_unidirectional_right2left, + /// Bidirectional execution of RNN primitive with concatenation of the + /// results. + dnnl_bidirectional_concat, + /// Bidirectional execution of RNN primitive with summation of the + /// results. + dnnl_bidirectional_sum, +} dnnl_rnn_direction_t; + +/// @} dnnl_api_rnn + +/// @} dnnl_api_primitives + +/// @addtogroup dnnl_api_primitives +/// @{ +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// @struct dnnl_primitive_desc +/// @brief An opaque structure to describe a primitive descriptor. +struct dnnl_primitive_desc; + +/// @brief A primitive descriptor handle. +typedef struct dnnl_primitive_desc *dnnl_primitive_desc_t; + +/// @brief A constant primitive descriptor handle. +typedef const struct dnnl_primitive_desc *const_dnnl_primitive_desc_t; + +/// @} dnnl_api_primitives_common + +/// @addtogroup dnnl_api_attributes +/// @{ + +/// Scratchpad mode +typedef enum { + /// The library manages the scratchpad allocation according to the policy + /// specified by the `DNNL_ENABLE_CONCURRENT_EXEC` + /// [build option](@ref dev_guide_build_options) (default). + /// + /// When `DNNL_ENABLE_CONCURRENT_EXEC=OFF` (default), the library + /// scratchpad is common to all primitives to reduce the memory footprint. + /// This configuration comes with limited thread-safety properties, namely + /// primitives can be created and executed in parallel but cannot migrate + /// between threads (in other words, each primitive should be executed in + /// the same thread it was created in). + /// + /// When `DNNL_ENABLE_CONCURRENT_EXEC=ON`, the library scratchpad is + /// private to each primitive. The memory footprint is larger than when + /// using `DNNL_ENABLE_CONCURRENT_EXEC=OFF` but different primitives can be + /// created and run concurrently (the same primitive cannot be run + /// concurrently from two different threads though). + dnnl_scratchpad_mode_library, + /// The user manages the scratchpad allocation by querying and providing + /// the scratchpad memory to primitives. This mode is thread-safe as long + /// as the scratchpad buffers are not used concurrently by two primitive + /// executions. + dnnl_scratchpad_mode_user, +} dnnl_scratchpad_mode_t; + +/// Rounding mode +typedef enum { + /// Rounding mode dictated by the floating-point environment + dnnl_rounding_mode_environment, + /// Stochastic rounding mode where a random bias is added to the + /// trailing mantissa bits before conversion. + dnnl_rounding_mode_stochastic, +} dnnl_rounding_mode_t; + +/// @struct dnnl_primitive_attr +/// @brief An opaque structure for primitive descriptor attributes. +/// +/// Attributes may contain: +/// - output scales (to scale the result prior to storing it to the memory) +struct dnnl_primitive_attr; + +/// @brief A primitive descriptor attributes handle that controls primitive +/// behavior. +typedef struct dnnl_primitive_attr *dnnl_primitive_attr_t; + +/// @brief A constant primitive descriptor attributes handle. +typedef const struct dnnl_primitive_attr *const_dnnl_primitive_attr_t; + +/// @struct dnnl_post_ops +/// @brief An opaque structure for a chain of post operations. +/// +/// dnnl_post_ops can be used to perform some (trivial) operations like +/// accumulation or eltwise after certain primitives like convolution. +/// +/// Post operations may be combined together, making a chain of post +/// operations. For instance, one can configure convolution followed by +/// accumulation followed by eltwise. This might be especially beneficial +/// for residual learning blocks. +/// +/// @warning +/// Not all combinations are supported, so the user should handle +/// errors accordingly. +/// +/// Supported post operations: +/// - accumulation (base primitive: convolution) +/// - eltwise (base primitive: convolution) +struct dnnl_post_ops; + +/// @brief A post operation chain handle. +typedef struct dnnl_post_ops *dnnl_post_ops_t; + +/// @brief A constant post operation chain handle. +typedef const struct dnnl_post_ops *const_dnnl_post_ops_t; + +/// @} dnnl_api_attributes
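A sketch of the attribute/post-ops flow just described (editorial; it assumes the oneDNN 3.x C API, where the chain is copied into the attributes by dnnl_primitive_attr_set_post_ops): sum accumulation followed by a ReLU eltwise, the classic residual-block fusion:

```c
#include "oneapi/dnnl/dnnl.h"

void make_attr_with_post_ops(dnnl_primitive_attr_t *attr) {
    dnnl_post_ops_t ops;
    dnnl_post_ops_create(&ops);
    // Accumulate into the existing destination: dst = 1.0 * dst_old + result.
    // dnnl_data_type_undef keeps the destination's own data type.
    dnnl_post_ops_append_sum(ops, 1.0f, 0, dnnl_data_type_undef);
    // Then apply ReLU (alpha = negative slope = 0.f) in the same pass.
    dnnl_post_ops_append_eltwise(ops, dnnl_eltwise_relu, 0.f, 0.f);
    dnnl_primitive_attr_create(attr);
    dnnl_primitive_attr_set_post_ops(*attr, ops);
    dnnl_post_ops_destroy(ops); // the attributes hold their own copy
}
```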
+ +/// @addtogroup dnnl_api_primitives_common +/// @{ + +/// @struct dnnl_primitive +/// An opaque structure to describe a primitive. +struct dnnl_primitive; +/// A primitive handle. +typedef struct dnnl_primitive *dnnl_primitive_t; +/// A constant primitive handle. +typedef const struct dnnl_primitive *const_dnnl_primitive_t; + +/// Undefined argument. +#define DNNL_ARG_UNDEF 0 +/// Source argument #0. +#define DNNL_ARG_SRC_0 1 +/// A special mnemonic for source argument for primitives that have a +/// single source. An alias for #DNNL_ARG_SRC_0. +#define DNNL_ARG_SRC DNNL_ARG_SRC_0 +/// A special mnemonic for RNN input vector. An alias for +/// #DNNL_ARG_SRC_0. +#define DNNL_ARG_SRC_LAYER DNNL_ARG_SRC_0 +/// A special mnemonic for reorder source argument. An alias for +/// #DNNL_ARG_SRC_0. +#define DNNL_ARG_FROM DNNL_ARG_SRC_0 + +/// Source argument #1. +#define DNNL_ARG_SRC_1 2 +/// A special mnemonic for RNN input recurrent hidden state vector. An alias +/// for #DNNL_ARG_SRC_1. +#define DNNL_ARG_SRC_ITER DNNL_ARG_SRC_1 + +/// Source argument #2. +#define DNNL_ARG_SRC_2 3 +/// A special mnemonic for RNN input recurrent cell state vector. An alias for +/// #DNNL_ARG_SRC_2. +#define DNNL_ARG_SRC_ITER_C DNNL_ARG_SRC_2 + +/// Source argument #3. +#define DNNL_ARG_SRC_3 4 +/// A special mnemonic for RNN input recurrent cell attention vector. An alias for +/// #DNNL_ARG_SRC_3. +#define DNNL_ARG_AUGRU_ATTENTION DNNL_ARG_SRC_3 + +/// Destination argument #0. +#define DNNL_ARG_DST_0 17 +/// A special mnemonic for destination argument for primitives that have a +/// single destination. An alias for #DNNL_ARG_DST_0. +#define DNNL_ARG_DST DNNL_ARG_DST_0 +/// A special mnemonic for reorder destination argument. An alias for +/// #DNNL_ARG_DST_0. +#define DNNL_ARG_TO DNNL_ARG_DST_0 +/// A special mnemonic for RNN output vector. An alias for #DNNL_ARG_DST_0. +#define DNNL_ARG_DST_LAYER DNNL_ARG_DST_0 + +/// Destination argument #1. +#define DNNL_ARG_DST_1 18 +/// A special mnemonic for RNN output recurrent hidden state vector. An +/// alias for #DNNL_ARG_DST_1. +#define DNNL_ARG_DST_ITER DNNL_ARG_DST_1 + +/// Destination argument #2. +#define DNNL_ARG_DST_2 19 +/// A special mnemonic for LSTM output recurrent cell state vector. An +/// alias for #DNNL_ARG_DST_2. +#define DNNL_ARG_DST_ITER_C DNNL_ARG_DST_2 + +/// Weights argument #0. +#define DNNL_ARG_WEIGHTS_0 33 +/// A special mnemonic for primitives that have a single weights +/// argument. An alias for #DNNL_ARG_WEIGHTS_0. +#define DNNL_ARG_WEIGHTS DNNL_ARG_WEIGHTS_0 +/// A special mnemonic for RNN weights applied to the layer input. An +/// alias for #DNNL_ARG_WEIGHTS_0. +#define DNNL_ARG_WEIGHTS_LAYER DNNL_ARG_WEIGHTS_0 + +/// Weights argument #1. +#define DNNL_ARG_WEIGHTS_1 34 +/// A special mnemonic for RNN weights applied to the recurrent input. +/// An alias for #DNNL_ARG_WEIGHTS_1. +#define DNNL_ARG_WEIGHTS_ITER DNNL_ARG_WEIGHTS_1 + +/// Weights argument #2. +#define DNNL_ARG_WEIGHTS_2 35 +/// A special mnemonic for the RNN peephole weights. +/// An alias for #DNNL_ARG_WEIGHTS_2.
+#define DNNL_ARG_WEIGHTS_PEEPHOLE DNNL_ARG_WEIGHTS_2 + +/// Weights argument #3. +#define DNNL_ARG_WEIGHTS_3 36 +/// A special mnemonic for the RNN projection weights. +/// An alias for #DNNL_ARG_WEIGHTS_3. +#define DNNL_ARG_WEIGHTS_PROJECTION DNNL_ARG_WEIGHTS_3 + +/// Bias tensor argument. +#define DNNL_ARG_BIAS 41 + +/// Mean values tensor argument. +#define DNNL_ARG_MEAN 49 +/// Variance values tensor argument. +#define DNNL_ARG_VARIANCE 50 + +/// A special mnemonic for the scale argument of normalization primitives. +#define DNNL_ARG_SCALE 51 +/// A special mnemonic for the shift argument of normalization primitives. +#define DNNL_ARG_SHIFT 52 + +/// Workspace tensor argument. Workspace is used to pass information +/// from forward propagation to backward propagation computations. +#define DNNL_ARG_WORKSPACE 64 +/// Scratchpad (temporary storage) tensor argument. +#define DNNL_ARG_SCRATCHPAD 80 + +/// Gradient (diff) of the source argument #0. +#define DNNL_ARG_DIFF_SRC_0 129 +/// A special mnemonic for primitives that have a single diff source argument. +/// An alias for #DNNL_ARG_DIFF_SRC_0. +#define DNNL_ARG_DIFF_SRC DNNL_ARG_DIFF_SRC_0 +/// A special mnemonic for gradient (diff) of RNN input vector. An alias for +/// #DNNL_ARG_DIFF_SRC_0. +#define DNNL_ARG_DIFF_SRC_LAYER DNNL_ARG_DIFF_SRC_0 + +/// Gradient (diff) of the source argument #1. +#define DNNL_ARG_DIFF_SRC_1 130 +/// A special mnemonic for gradient (diff) of RNN input recurrent hidden state +/// vector. An alias for #DNNL_ARG_DIFF_SRC_1. +#define DNNL_ARG_DIFF_SRC_ITER DNNL_ARG_DIFF_SRC_1 + +/// Gradient (diff) of the source argument #2. +#define DNNL_ARG_DIFF_SRC_2 131 +/// A special mnemonic for gradient (diff) of RNN input recurrent cell state +/// vector. An alias for #DNNL_ARG_DIFF_SRC_2. +#define DNNL_ARG_DIFF_SRC_ITER_C DNNL_ARG_DIFF_SRC_2 + +/// Gradient (diff) of the source argument #3. +#define DNNL_ARG_DIFF_SRC_3 132 +/// A special mnemonic for gradient (diff) of RNN input recurrent cell attention +/// vector. An alias for #DNNL_ARG_DIFF_SRC_3. +#define DNNL_ARG_DIFF_AUGRU_ATTENTION DNNL_ARG_DIFF_SRC_3 + +/// Gradient (diff) of the destination argument #0. +#define DNNL_ARG_DIFF_DST_0 145 +/// A special mnemonic for primitives that have a single diff destination +/// argument. An alias for #DNNL_ARG_DIFF_DST_0. +#define DNNL_ARG_DIFF_DST DNNL_ARG_DIFF_DST_0 +/// A special mnemonic for gradient (diff) of RNN output vector. An alias for +/// #DNNL_ARG_DIFF_DST_0. +#define DNNL_ARG_DIFF_DST_LAYER DNNL_ARG_DIFF_DST_0 + +/// Gradient (diff) of the destination argument #1. +#define DNNL_ARG_DIFF_DST_1 146 +/// A special mnemonic for gradient (diff) of RNN output recurrent hidden state +/// vector. An alias for #DNNL_ARG_DIFF_DST_1. +#define DNNL_ARG_DIFF_DST_ITER DNNL_ARG_DIFF_DST_1 + +/// Gradient (diff) of the destination argument #2. +#define DNNL_ARG_DIFF_DST_2 147 +/// A special mnemonic for gradient (diff) of LSTM output recurrent cell state +/// vector. An alias for #DNNL_ARG_DIFF_DST_2. +#define DNNL_ARG_DIFF_DST_ITER_C DNNL_ARG_DIFF_DST_2 + +/// Gradient (diff) of the weights argument #0. +#define DNNL_ARG_DIFF_WEIGHTS_0 161 +/// A special mnemonic for primitives that have a single diff weights +/// argument. An alias for #DNNL_ARG_DIFF_WEIGHTS_0. +#define DNNL_ARG_DIFF_WEIGHTS DNNL_ARG_DIFF_WEIGHTS_0 +/// A special mnemonic for diff of RNN weights applied to the layer input. An +/// alias for #DNNL_ARG_DIFF_WEIGHTS_0.
+#define DNNL_ARG_DIFF_WEIGHTS_LAYER DNNL_ARG_DIFF_WEIGHTS_0
+
+/// Gradient (diff) of the weights argument #1.
+#define DNNL_ARG_DIFF_WEIGHTS_1 162
+/// A special mnemonic for diff of RNN weights applied to the recurrent input.
+/// An alias for #DNNL_ARG_DIFF_WEIGHTS_1.
+#define DNNL_ARG_DIFF_WEIGHTS_ITER DNNL_ARG_DIFF_WEIGHTS_1
+
+/// Gradient (diff) of the weights argument #2.
+#define DNNL_ARG_DIFF_WEIGHTS_2 163
+/// A special mnemonic for diff of RNN weights applied to the peephole weights.
+/// An alias for #DNNL_ARG_DIFF_WEIGHTS_2.
+#define DNNL_ARG_DIFF_WEIGHTS_PEEPHOLE DNNL_ARG_DIFF_WEIGHTS_2
+
+/// Gradient (diff) of the weights argument #3.
+#define DNNL_ARG_DIFF_WEIGHTS_3 164
+/// A special mnemonic for diff of RNN weights applied to the projection
+/// weights. An alias for #DNNL_ARG_DIFF_WEIGHTS_3.
+#define DNNL_ARG_DIFF_WEIGHTS_PROJECTION DNNL_ARG_DIFF_WEIGHTS_3
+
+/// Gradient (diff) of the bias tensor argument.
+#define DNNL_ARG_DIFF_BIAS 169
+
+/// A special mnemonic for gradient (diff) of scale argument of normalization
+/// primitives.
+#define DNNL_ARG_DIFF_SCALE 255
+/// A special mnemonic for gradient (diff) of shift argument of normalization
+/// primitives.
+#define DNNL_ARG_DIFF_SHIFT 256
+
+/// Rounding mode seed for stochastic rounding. A single seed is needed
+/// regardless of how many arguments require stochastic rounding.
+#define DNNL_ARG_ATTR_ROUNDING_SEED 508
+
+/// Dropout mask output buffer.
+#define DNNL_ARG_ATTR_DROPOUT_MASK 509
+
+/// Dropout probability value passed via a buffer.
+#define DNNL_ARG_ATTR_DROPOUT_PROBABILITY 510
+
+/// Dropout RNG seed value passed via a buffer.
+#define DNNL_ARG_ATTR_DROPOUT_SEED 511
+
+/// Output scaling factors provided at execution time.
+#define DNNL_ARG_ATTR_OUTPUT_SCALES 513
+
+/// Starting index for source arguments for primitives that take a variable
+/// number of source arguments.
+#define DNNL_ARG_MULTIPLE_SRC 1024
+/// Starting index for destination arguments for primitives that produce a
+/// variable number of destination arguments.
+#define DNNL_ARG_MULTIPLE_DST 2048
+
+/// Scaling factors provided at execution time.
+#define DNNL_ARG_ATTR_SCALES 4096
+
+/// Zero points provided at execution time.
+#define DNNL_ARG_ATTR_ZERO_POINTS 8192
+
+/// Arguments for fused depthwise convolution.
+/// See @ref dev_guide_attributes_post_ops_depthwise_fusion
+#define DNNL_ARG_ATTR_POST_OP_DW 16384
+
+/// Starting point for a binary post operation.
+#define DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE 32768
+
+/// Arguments for a binary post operation. Up to 32 arguments are supported.
+/// See @ref dev_guide_attributes_post_ops_binary_fusion
+#define DNNL_ARG_ATTR_MULTIPLE_POST_OP(idx) \
+    (DNNL_ARG_ATTR_MULTIPLE_POST_OP_BASE * ((idx) + 1))
+
+/// A structure that contains an index and a memory object, and is used to pass
+/// arguments to dnnl_primitive_execute().
+typedef struct {
+    int arg; ///< An argument index, e.g. DNNL_ARG_SRC
+    dnnl_memory_t memory; ///< Input/output memory
+} dnnl_exec_arg_t;
+
+/// @} dnnl_api_primitives_common
+
+/// @addtogroup dnnl_api_primitives_common
+/// @{
+
+/// Primitive descriptor query specification
+///
+/// For generic function dnnl_primitive_desc_query(), the type of result must
+/// agree with the queried argument.
The correspondence table: +/// +/// Query kind | Type of query result +/// --------------------------------|----------------------------- +/// dnnl_query_*_engine | #dnnl_engine_t * +/// #dnnl_query_primitive_kind | #dnnl_primitive_kind_t * +/// dnnl_query_*_s32 | int * +/// dnnl_query_*_s64 | #dnnl_dim_t * (same as int64_t *) +/// dnnl_query_*_f32 | float * +/// dnnl_query_*_f64 | double * +/// dnnl_query_*_str | const char ** +/// dnnl_query_*_md | #const_dnnl_memory_desc_t * +/// dnnl_query_*_pd | #const_dnnl_primitive_desc_t * +/// dnnl_query_cache_blob_id | const uint8_t ** +/// dnnl_query_strides | const #dnnl_dims_t ** +/// dnnl_query_dilations | const #dnnl_dims_t ** +/// dnnl_query_padding_l | const #dnnl_dims_t ** +/// dnnl_query_padding_r | const #dnnl_dims_t ** +/// dnnl_query_flags | unsigned * +/// dnnl_query_alg_kind | #dnnl_alg_kind_t * +/// dnnl_query_factors | const float ** +/// dnnl_query_cell_kind | #dnnl_alg_kind_t * +/// dnnl_query_direction | #dnnl_rnn_direction_t * +/// dnnl_query_activation_kind | #dnnl_alg_kind_t * +/// dnnl_query_kernel | const #dnnl_dims_t ** +/// dnnl_query_dims | const #dnnl_dims_t ** +/// dnnl_query_data_type | #dnnl_data_type_t * +/// dnnl_query_padded_dims | const #dnnl_dims_t ** +/// dnnl_query_padded_offsets | const #dnnl_dims_t ** +/// dnnl_query_format_kind | #dnnl_format_kind_t * +/// dnnl_query_inner_blks | const #dnnl_dims_t ** +/// dnnl_query_inner_idxs | const #dnnl_dims_t ** +/// dnnl_query_sparse_encoding | #dnnl_sparse_encoding_t * +/// +/// @note +/// Rule of thumb: all opaque types and structures are returned by +/// reference. All numbers are returned by value. +/// +/// @warning +/// All returned references point to constant objects and are valid only +/// during the lifetime of the queried primitive descriptor. Returned objects +/// must not be destroyed by the user. If you need to keep the object longer +/// than the lifetime of the queried primitive descriptor, use +/// dnnl_primitive_desc_clone() to make a copy. 
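+///
+/// For illustration, assuming `pd` is a valid primitive descriptor handle,
+/// a by-value number and a by-reference memory descriptor can be queried as
+/// follows (per the table above):
+///
+/// @code
+/// int n_inputs = 0;
+/// dnnl_primitive_desc_query(pd, dnnl_query_num_of_inputs_s32, 0, &n_inputs);
+///
+/// const_dnnl_memory_desc_t src_md = NULL; // owned by pd; do not destroy
+/// dnnl_primitive_desc_query(pd, dnnl_query_src_md, 0, &src_md);
+/// @endcode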
+typedef enum {
+    dnnl_query_undef = 0, ///< no query
+
+    dnnl_query_engine, ///< execution engine
+    dnnl_query_primitive_kind, ///< primitive kind
+
+    dnnl_query_num_of_inputs_s32, ///< number of inputs expected
+    dnnl_query_num_of_outputs_s32, ///< number of outputs expected
+
+    dnnl_query_time_estimate_f64, ///< runtime estimation (seconds)
+    dnnl_query_memory_consumption_s64, ///< memory consumption -- extra
+    ///  (scratch) memory, additional to
+    ///  all inputs and outputs memory
+    ///  (bytes)
+
+    dnnl_query_scratchpad_engine, ///< scratchpad engine -- engine to be used
+    ///  for creating scratchpad memory
+
+    dnnl_query_impl_info_str, ///< implementation name
+
+    dnnl_query_reorder_src_engine, ///< source engine
+    dnnl_query_reorder_dst_engine, ///< destination engine
+
+    dnnl_query_prop_kind, ///< propagation kind
+
+    dnnl_query_cache_blob_id_size_s64, ///< size of cache blob ID in bytes
+    dnnl_query_cache_blob_id, ///< cache blob ID (pointer to array)
+
+    dnnl_query_strides, ///< strides
+    dnnl_query_dilations, ///< dilations
+    dnnl_query_padding_l, ///< left padding
+    dnnl_query_padding_r, ///< right padding
+    dnnl_query_epsilon_f32, ///< epsilon
+    dnnl_query_flags, ///< flags
+    dnnl_query_alg_kind, ///< algorithm kind
+    dnnl_query_alpha_f32, ///< alpha
+    dnnl_query_beta_f32, ///< beta
+    dnnl_query_axis_s32, ///< axis
+    dnnl_query_local_size_s64, ///< LRN parameter local size
+    dnnl_query_k_f32, ///< LRN parameter K
+    dnnl_query_p_f32, ///< Reduction parameter P
+    dnnl_query_factors, ///< Resampling parameter factors
+    dnnl_query_cell_kind, ///< RNN parameter cell kind
+    dnnl_query_direction, ///< RNN parameter direction
+    dnnl_query_activation_kind, ///< RNN parameter activation kind
+    dnnl_query_kernel, ///< Pooling parameter kernel
+    dnnl_query_group_size_s64, ///< Shuffle parameter group size
+
+    // memory descriptor section
+    dnnl_query_some_md = 128, ///< stub
+    dnnl_query_src_md, ///< source memory desc
+    dnnl_query_diff_src_md, ///< source gradient memory desc
+    dnnl_query_weights_md, ///< weights memory desc
+    dnnl_query_diff_weights_md, ///< weights grad. memory desc
+    dnnl_query_dst_md, ///< destination memory desc
+    dnnl_query_diff_dst_md, ///< destination grad.
memory desc + dnnl_query_workspace_md, ///< workspace memory desc + dnnl_query_scratchpad_md, ///< scratchpad memory desc + dnnl_query_exec_arg_md = 255, ///< memory desc of an execute argument + + dnnl_query_ndims_s32, ///< number of dimensions + dnnl_query_dims, ///< vector of dimensions + dnnl_query_data_type, ///< data type + dnnl_query_submemory_offset_s64, ///< submemory offset + dnnl_query_padded_dims, ///< vector of padded dimensions + dnnl_query_padded_offsets, ///< vector of padded offsets + dnnl_query_format_kind, ///< format kind + dnnl_query_inner_nblks_s32, ///< number of innermost blocks + dnnl_query_inner_blks, ///< vector of sizes of the innermost blocks + dnnl_query_inner_idxs, ///< vector of logical indices of the blocks +#ifdef DNNL_EXPERIMENTAL_SPARSE + dnnl_query_sparse_encoding, ///< Sparse encoding + dnnl_query_nnz_s64, ///< Number of non-zero entries + dnnl_query_num_handles_s32, ///< Number of buffers required for a memory +/// descriptor +#endif + // Max value to prevent UB for internal use only dnnl_query_t + dnnl_query_max = 0x7fff, +} dnnl_query_t; + +/// @} dnnl_api_primitives_common + +/// @} dnnl_api_primitives + +/// @addtogroup dnnl_api_service +/// @{ + +/// Disable profiling completely +#define DNNL_JIT_PROFILE_NONE 0u + +/// Enable VTune Profiler integration +#define DNNL_JIT_PROFILE_VTUNE 1u + +/// Enable Linux perf integration via perfmap files +#define DNNL_JIT_PROFILE_LINUX_PERFMAP 2u + +/// Enable Linux perf integration via jitdump files +#define DNNL_JIT_PROFILE_LINUX_JITDUMP 4u + +/// Instruct Linux perf integration via jitdump files to use TSC. @ref +/// DNNL_JIT_PROFILE_LINUX_JITDUMP must be set too for this to take effect. +#define DNNL_JIT_PROFILE_LINUX_JITDUMP_USE_TSC 8u + +/// Enable Linux perf integration (both jitdump and perfmap) +#define DNNL_JIT_PROFILE_LINUX_PERF \ + (DNNL_JIT_PROFILE_LINUX_JITDUMP | DNNL_JIT_PROFILE_LINUX_PERFMAP) + +/// CPU instruction set flags +typedef enum { + /// Library choice of ISA (excepting those listed as initial support) + dnnl_cpu_isa_default = 0x0, + + /// Intel Streaming SIMD Extensions 4.1 (Intel SSE4.1) + dnnl_cpu_isa_sse41 = 0x1, + + /// Intel Advanced Vector Extensions (Intel AVX) + dnnl_cpu_isa_avx = 0x3, + + /// Intel Advanced Vector Extensions 2 (Intel AVX2) + dnnl_cpu_isa_avx2 = 0x7, + + /// Intel AVX2 and Intel Deep Learning Boost (Intel DL Boost) support + dnnl_cpu_isa_avx2_vnni = 0xf, + + /// Intel AVX2 and Intel Deep Learning Boost (Intel DL Boost) + /// with 8-bit integer, float16 and bfloat16 support + dnnl_cpu_isa_avx2_vnni_2 = 0x1f, + + /// Intel AVX-512 subset for Intel Xeon Scalable processor family + /// and Intel Core processor family. + dnnl_cpu_isa_avx512_core = 0x27, + + /// Intel AVX-512 and Intel Deep Learning Boost (Intel DL Boost) support + /// for Intel Xeon Scalable processor family + /// and Intel Core processor family. + dnnl_cpu_isa_avx512_core_vnni = 0x67, + + /// Intel AVX-512, Intel DL Boost and bfloat16 support + /// for Intel Xeon Scalable processor family + /// and Intel Core processor family. + dnnl_cpu_isa_avx512_core_bf16 = 0xe7, + + /// Intel AVX-512 with float16, Intel DL Boost and bfloat16 support + /// for Intel Xeon Scalable processor family + /// and Intel Core processor family. + // TODO: Align avx10_1 values to internal representation. 
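+    // Note: the enumerator values form nested bit masks; e.g.
+    // dnnl_cpu_isa_avx2 (0x7) contains all bits of dnnl_cpu_isa_avx (0x3),
+    // which in turn contains dnnl_cpu_isa_sse41 (0x1).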
+ dnnl_cpu_isa_avx10_1_512 = 0x1ef, + /// @copydoc dnnl_cpu_isa_avx10_1_512 + dnnl_cpu_isa_avx512_core_fp16 = dnnl_cpu_isa_avx10_1_512, + + /// Intel AVX-512 with float16, Intel DL Boost and bfloat16 support and + /// Intel AMX with 8-bit integer and bfloat16 support + // TODO: Align avx10_1 values to internal representation. + dnnl_cpu_isa_avx10_1_512_amx = 0xfef, + /// @copydoc dnnl_cpu_isa_avx10_1_512_amx + dnnl_cpu_isa_avx512_core_amx = dnnl_cpu_isa_avx10_1_512_amx, + + /// Intel AVX-512 with float16, Intel DL Boost and bfloat16 support and + /// Intel AMX with 8-bit integer, bfloat16 and float16 support + // TODO: Align avx10_1 values to internal representation. + dnnl_cpu_isa_avx10_1_512_amx_fp16 = 0x1fef, + /// @copydoc dnnl_cpu_isa_avx10_1_512_amx_fp16 + dnnl_cpu_isa_avx512_core_amx_fp16 = dnnl_cpu_isa_avx10_1_512_amx_fp16, +} dnnl_cpu_isa_t; + +/// CPU ISA hints flags +typedef enum { + /// No hints (use default features) + dnnl_cpu_isa_no_hints = 0x0, + + /// Prefer to exclusively use Ymm registers for computations + dnnl_cpu_isa_prefer_ymm = 0x1, +} dnnl_cpu_isa_hints_t; + +/// @} dnnl_api_service + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif /* ONEAPI_DNNL_TYPES_H */ diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4d768c8674ffcad62ad63b6890aeda0d57472bb2 --- /dev/null +++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.h @@ -0,0 +1,337 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @file +/// ukernel C API + +#ifndef ONEAPI_DNNL_DNNL_UKERNEL_H +#define ONEAPI_DNNL_DNNL_UKERNEL_H + +#include "oneapi/dnnl/dnnl.h" +#include "oneapi/dnnl/dnnl_ukernel_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/// @addtogroup dnnl_api +/// @{ + +/// @addtogroup dnnl_api_ukernel +/// @{ + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +/// Creates a ukernel attributes memory storage. +/// +/// @param attr_params Output ukernel attributes memory storage. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ukernel_attr_params_create( + dnnl_ukernel_attr_params_t *attr_params); + +/// Sets post-operations arguments to a storage. +/// +/// @param attr_params Memory pointers storage object. +/// @param post_ops_args A pointer to pointers of post_ops storages. Expected to +/// be packed together. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_post_ops_args( + dnnl_ukernel_attr_params_t attr_params, const void **post_ops_args); + +/// Sets tensor A scales argument to a storage. +/// +/// @param attr_params Memory pointers storage object. 
+/// @param a_scales Pointer to the scales storage.
+/// @returns #dnnl_success on success and a status describing the error
+/// otherwise.
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_A_scales(
+        dnnl_ukernel_attr_params_t attr_params, const void *a_scales);
+
+/// Sets tensor B scales argument to a storage.
+///
+/// If `dnnl_brgemm_set_B_scales` was called with a mask of 2, then at least N
+/// values of the selected data type are expected.
+///
+/// @param attr_params Memory pointers storage object.
+/// @param b_scales Pointer to the scales storage.
+/// @returns #dnnl_success on success and a status describing the error
+/// otherwise.
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_B_scales(
+        dnnl_ukernel_attr_params_t attr_params, const void *b_scales);
+
+/// Sets tensor D scales argument to a storage.
+///
+/// @param attr_params Memory pointers storage object.
+/// @param d_scales Pointer to the scales storage.
+/// @returns #dnnl_success on success and a status describing the error
+/// otherwise.
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_set_D_scales(
+        dnnl_ukernel_attr_params_t attr_params, const void *d_scales);
+
+/// Destroys a ukernel attributes memory storage.
+///
+/// @param attr_params Memory pointers storage object to destroy.
+/// @returns #dnnl_success on success and a status describing the error
+/// otherwise.
+dnnl_status_t DNNL_API dnnl_ukernel_attr_params_destroy(
+        dnnl_ukernel_attr_params_t attr_params);
+
+/// @addtogroup dnnl_api_ukernel_brgemm
+/// @{
+
+/// Creates a BRGeMM ukernel object. Operates by the following formula:
+/// `C = [A x B]`.
+///
+/// @param brgemm Output BRGeMM ukernel object.
+/// @param M Dimension M of tensor A.
+/// @param N Dimension N of tensor B.
+/// @param K Dimension K of tensors A and B.
+/// @param batch_size Number of batches to process.
+/// @param lda Leading dimension of tensor A.
+/// @param ldb Leading dimension of tensor B.
+/// @param ldc Leading dimension of tensor C.
+/// @param a_dt Data type of tensor A.
+/// @param b_dt Data type of tensor B.
+/// @param c_dt Data type of tensor C. Must be dnnl_f32.
+/// @returns #dnnl_success on success and a status describing the error
+/// otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_create(dnnl_brgemm_t *brgemm, dnnl_dim_t M,
+        dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t batch_size, dnnl_dim_t lda,
+        dnnl_dim_t ldb, dnnl_dim_t ldc, dnnl_data_type_t a_dt,
+        dnnl_data_type_t b_dt, dnnl_data_type_t c_dt);
+
+/// Sets adding an intermediate result to the output tensor C instead of
+/// writing: `C += [A x B]`.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param add_C Value to indicate addition. Can be `0` to skip addition, and
+/// `1` to apply addition.
+/// @returns #dnnl_success on success and a status describing the error
+/// otherwise.
+dnnl_status_t DNNL_API dnnl_brgemm_set_add_C(dnnl_brgemm_t brgemm, int add_C);
+
+/// Sets post-operations to a BRGeMM ukernel object: `D = post-operations(C)`.
+///
+/// Post-operations apply if one of the following holds:
+/// * Non-empty attributes are specified.
+/// * Output data type `d_dt` is different from accumulation data type `c_dt`.
+///
+/// If either condition holds, the final call in the accumulation chain must
+/// be `dnnl_brgemm_execute_postops`; otherwise, it must be
+/// `dnnl_brgemm_execute`.
+///
+/// @param brgemm BRGeMM ukernel object.
+/// @param ldd Leading dimension of tensor D.
+/// @param d_dt Data type of tensor D.
+/// @param post_ops Primitive post operations attribute to extend the kernel +/// operations. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_set_post_ops(dnnl_brgemm_t brgemm, + dnnl_dim_t ldd, dnnl_data_type_t d_dt, const_dnnl_post_ops_t post_ops); + +/// Sets tensor A scales mask to a BRGeMM ukernel object. +/// +/// For quantization flavor tensor A scales apply to accumulation buffer once C +/// is ready. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param a_scale_mask Tensor A scale mask. Can be `0` only. +dnnl_status_t DNNL_API dnnl_brgemm_set_A_scales( + dnnl_brgemm_t brgemm, int a_scale_mask); + +/// Sets tensor B scales mask to a BRGeMM ukernel object. +/// +/// For quantization flavor tensor B scales apply to accumulation buffer once C +/// is ready. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param b_scale_mask Tensor B scale mask. Can be `0` and `2` only. +dnnl_status_t DNNL_API dnnl_brgemm_set_B_scales( + dnnl_brgemm_t brgemm, int b_scale_mask); + +/// Sets tensor D scales mask to a BRGeMM ukernel object. +/// +/// For quantization flavor tensor D scales apply after all post-ops are +/// applied. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param d_scale_mask Tensor D scale mask. Can be `0` only. +dnnl_status_t DNNL_API dnnl_brgemm_set_D_scales( + dnnl_brgemm_t brgemm, int d_scale_mask); + +/// Finalizes initialization of a BRGeMM ukernel object. +/// +/// This step is mandatory to query information from the object. +/// +/// @param brgemm Output BRGeMM ukernel object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_finalize(dnnl_brgemm_t brgemm); + +/// Returns the packing type expected by a tensor B of a BRGeMM ukernel object. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param pack_type Output packing type. Can be `dnnl_brgemm_no_pack` if +/// packing is not expected, and `dnnl_brgemm_pack_32`, otherwise. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_get_B_pack_type( + const_dnnl_brgemm_t brgemm, dnnl_pack_type_t *pack_type); + +/// Returns the size of a scratchpad memory needed for the BRGeMM ukernel +/// object. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param size Output size of a buffer required for the BRGeMM ukernel object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_get_scratchpad_size( + const_dnnl_brgemm_t brgemm, size_t *size); + +/// Returns the flag indicating when the call to `dnnl_brgemm_execute_postops` +/// is valid. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param valid The flag indicating if `dnnl_brgemm_execute_postops` is valid +/// for a given ukernel object. `1` is for valid and `0`, otherwise. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_is_execute_postops_valid( + const_dnnl_brgemm_t brgemm, int *valid); + +/// Initializes the hardware-specific context. If no initialization required, +/// returns the success status. +/// +/// @param brgemm BRGeMM ukernel object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_set_hw_context(const_dnnl_brgemm_t brgemm); + +/// Releases the hardware-specific context. 
Must be used after all the execution +/// calls to BRGeMM ukernel objects. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_release_hw_context(); + +/// Generates an executable part of BRGeMM ukernel object. +/// @param brgemm BRGeMM ukernel object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_generate(dnnl_brgemm_t brgemm); + +/// Executes a BRGeMM ukernel object. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param A_ptr Base pointer to a tensor A. +/// @param B_ptr Base pointer to a tensor B. +/// @param A_B_offsets Pointer to the set of tensor A and tensor B offsets for +/// each batch; the set must be contiguous in memory. Single batch should +/// supply offsets for both tensors A and B simultaneously. The number of +/// batches must coincide with the `batch_size` value passed at the creation +/// stage. +/// @param C_ptr Pointer to a tensor C (accumulation buffer). +/// @param scratchpad_ptr Pointer to a scratchpad buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_execute(const_dnnl_brgemm_t brgemm, + const void *A_ptr, const void *B_ptr, const dnnl_dim_t *A_B_offsets, + void *C_ptr, void *scratchpad_ptr); + +/// Executes a BRGeMM ukernel object with post operations. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param A Base pointer to a tensor A. +/// @param B Base pointer to a tensor B. +/// @param A_B_offsets Pointer to a set of tensor A and tensor B offsets for +/// each batch. A set must be contiguous in memory. A single batch should +/// supply offsets for both tensors A and B simultaneously. The number of +/// batches must coincide with the `batch_size` value passed at the creation +/// stage. +/// @param C_ptr Pointer to a tensor C (accumulation buffer). +/// @param D_ptr Pointer to a tensor D (output buffer). +/// @param scratchpad_ptr Pointer to a scratchpad buffer. +/// @param attr_params Ukernel attributes memory storage. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_execute_postops(const_dnnl_brgemm_t brgemm, + const void *A, const void *B, const dnnl_dim_t *A_B_offsets, + const void *C_ptr, void *D_ptr, void *scratchpad_ptr, + const_dnnl_ukernel_attr_params_t attr_params); + +/// Destroys a BRGeMM ukernel object. +/// +/// @param brgemm BRGeMM ukernel object to destroy. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_destroy(dnnl_brgemm_t brgemm); + +/// Creates a transform object. +/// +/// @param transform Output transform object. +/// @param K Dimension K. +/// @param N Dimension N. +/// @param in_pack_type Input packing type. Must be one of +/// `dnnl_pack_type_no_trans`, or `dnnl_pack_type_trans`. +/// @param in_ld Input leading dimension. +/// @param out_ld Output leading dimension. When packing data, it specifies a +/// block by N dimension. +/// @param in_dt Input data type. +/// @param out_dt Output data type. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. 
+dnnl_status_t DNNL_API dnnl_transform_create(dnnl_transform_t *transform, + dnnl_dim_t K, dnnl_dim_t N, dnnl_pack_type_t in_pack_type, + dnnl_dim_t in_ld, dnnl_dim_t out_ld, dnnl_data_type_t in_dt, + dnnl_data_type_t out_dt); + +/// Generates an executable part of transform object. +/// @param transform Transform object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_transform_generate(dnnl_transform_t transform); + +/// Executes a transform object. +/// +/// @param transform Transform object. +/// @param in_ptr Pointer to an input buffer. +/// @param out_ptr Pointer to an output buffer. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_transform_execute( + const_dnnl_transform_t transform, const void *in_ptr, void *out_ptr); + +/// Destroys a transform object. +/// +/// @param transform Transform object. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_transform_destroy(dnnl_transform_t transform); + +/// @} dnnl_api_ukernel_brgemm + +#endif + +/// @} dnnl_api_ukernel + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif /* ONEAPI_DNNL_DNNL_UKERNEL_H */ diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.hpp b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ffc583e9bc1012b04778c7d56bfa96f833f98216 --- /dev/null +++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel.hpp @@ -0,0 +1,465 @@ +/******************************************************************************* +* Copyright 2024-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+/// @file
+/// ukernel C++ API
+
+#ifndef ONEAPI_DNNL_DNNL_UKERNEL_HPP
+#define ONEAPI_DNNL_DNNL_UKERNEL_HPP
+
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_ukernel.h"
+
+/// @addtogroup dnnl_api oneDNN API
+/// @{
+
+/// oneDNN namespace
+namespace dnnl {
+
+#ifdef DNNL_EXPERIMENTAL_UKERNEL
+
+/// @addtogroup dnnl_api_utils
+/// @{
+
+/// @cond DO_NOT_DOCUMENT_THIS
+
+template <>
+struct handle_traits<dnnl_brgemm_t> {
+    static dnnl_status_t destructor(dnnl_brgemm_t p) {
+        return dnnl_brgemm_destroy(p);
+    }
+};
+
+template <>
+struct handle_traits<dnnl_transform_t> {
+    static dnnl_status_t destructor(dnnl_transform_t p) {
+        return dnnl_transform_destroy(p);
+    }
+};
+
+template <>
+struct handle_traits<dnnl_ukernel_attr_params_t> {
+    static dnnl_status_t destructor(dnnl_ukernel_attr_params_t p) {
+        return dnnl_ukernel_attr_params_destroy(p);
+    }
+};
+
+/// @endcond
+
+/// @} dnnl_api_utils
+
+#endif
+
+/// @addtogroup dnnl_api_ukernel Ukernels
+/// Collection of ukernels
+/// @{
+
+/// ukernel namespace
+namespace ukernel {
+
+#ifdef DNNL_EXPERIMENTAL_UKERNEL
+
+/// @addtogroup dnnl_api_ukernel_utils ukernel utils
+/// ukernel utility functions
+/// @{
+
+/// Packing specification
+enum class pack_type {
+    /// Undefined pack type. A guard value.
+    undef = dnnl_pack_type_undef,
+    /// Plain, not transposed layout. Similar to format_tag::ab.
+    no_trans = dnnl_pack_type_no_trans,
+    /// Plain, transposed layout. Similar to format_tag::ba.
+    trans = dnnl_pack_type_trans,
+    /// Packed by 32 bits along K dimension layout.
+    pack32 = dnnl_pack_type_pack32,
+};
+
+/// Ukernel attributes memory storage
+struct attr_params : public handle<dnnl_ukernel_attr_params_t> {
+    /// Constructs a ukernel attributes memory storage.
+    attr_params() {
+        dnnl_ukernel_attr_params_t c_params = nullptr;
+        dnnl_status_t status = dnnl_ukernel_attr_params_create(&c_params);
+        error::wrap_c_api(
+                status, "could not create an attributes memory storage");
+        reset(c_params);
+    }
+
+    /// Sets post-operations arguments to a storage.
+    ///
+    /// @param post_ops_args Pointer to pointers of post_ops storages.
+    ///     Expected to be packed together.
+    void set_post_ops_args(const void **post_ops_args) {
+        dnnl_status_t status = dnnl_ukernel_attr_params_set_post_ops_args(
+                get(), post_ops_args);
+        if (status != dnnl_success)
+            error::wrap_c_api(
+                    status, "could not set post operations arguments");
+    }
+
+    /// Sets tensor A scales arguments to a storage.
+    ///
+    /// @param a_scales Pointer to scales storage.
+    void set_A_scales(const void *a_scales) {
+        dnnl_status_t status
+                = dnnl_ukernel_attr_params_set_A_scales(get(), a_scales);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set A scales argument");
+    }
+
+    /// Sets tensor B scales arguments to a storage.
+    ///
+    /// If @ref brgemm::set_B_scales was called with a mask of 2, then at
+    /// least N values of the selected data type are expected.
+    ///
+    /// @param b_scales Pointer to scales storage.
+    void set_B_scales(const void *b_scales) {
+        dnnl_status_t status
+                = dnnl_ukernel_attr_params_set_B_scales(get(), b_scales);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set B scales argument");
+    }
+
+    /// Sets tensor D scales arguments to a storage.
+    ///
+    /// @param d_scales Pointer to scales storage.
+    void set_D_scales(const void *d_scales) {
+        dnnl_status_t status
+                = dnnl_ukernel_attr_params_set_D_scales(get(), d_scales);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set D scales argument");
+    }
+};
+/// @} dnnl_api_ukernel_utils
+
+/// @addtogroup dnnl_api_ukernel_brgemm BRGeMM ukernel
+/// BRGeMM ukernel routines
+/// @{
+
+/// BRGeMM ukernel
+struct brgemm : public handle<dnnl_brgemm_t> {
+    /// Default constructor. Produces an empty object.
+    brgemm() = default;
+
+    /// Constructs a BRGeMM ukernel object. Operates by the following formula:
+    /// `C = [A x B]`.
+    ///
+    /// @param M Dimension M of tensor A.
+    /// @param N Dimension N of tensor B.
+    /// @param K Dimension K of tensors A and B.
+    /// @param batch_size Number of batches to process.
+    /// @param lda Leading dimension of tensor A.
+    /// @param ldb Leading dimension of tensor B.
+    /// @param ldc Leading dimension of tensor C.
+    /// @param a_dt Data type of tensor A.
+    /// @param b_dt Data type of tensor B.
+    /// @param c_dt Data type of tensor C.
+    /// @param allow_empty A flag signifying whether construction is
+    ///     allowed to fail without throwing an exception. In this case an
+    ///     empty object will be produced. This flag is optional and
+    ///     defaults to false.
+    brgemm(memory::dim M, memory::dim N, memory::dim K, memory::dim batch_size,
+            memory::dim lda, memory::dim ldb, memory::dim ldc,
+            memory::data_type a_dt, memory::data_type b_dt,
+            memory::data_type c_dt, bool allow_empty = false) {
+
+        dnnl_brgemm_t brgemm = nullptr;
+        dnnl_status_t status = dnnl_brgemm_create(&brgemm, M, N, K, batch_size,
+                lda, ldb, ldc, memory::convert_to_c(a_dt),
+                memory::convert_to_c(b_dt), memory::convert_to_c(c_dt));
+
+        if (!allow_empty)
+            error::wrap_c_api(
+                    status, "could not create a BRGeMM ukernel object");
+        reset(brgemm);
+    }
+
+    /// Sets adding an intermediate result to the output tensor C instead of
+    /// writing: `C += [A x B]`.
+    ///
+    /// @param add_C Value to indicate addition. `false` to skip addition, and
+    ///     `true` to apply addition.
+    void set_add_C(bool add_C) {
+        dnnl_status_t status
+                = dnnl_brgemm_set_add_C(get(), static_cast<int>(add_C));
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set add_C attribute");
+    }
+
+    /// Sets post-operations to a BRGeMM ukernel object:
+    /// `D = post-operations(C)`.
+    ///
+    /// Post-operations apply if one of the following holds:
+    /// * Non-empty post-operations are specified.
+    /// * Output data type `d_dt` is different from accumulation data type
+    ///   `c_dt`.
+    ///
+    /// @param ldd Leading dimension of tensor D.
+    /// @param d_dt Data type of tensor D.
+    /// @param po Primitive post-operation attributes to extend the kernel
+    ///     operations.
+    void set_post_ops(memory::dim ldd, memory::data_type d_dt,
+            const post_ops &po = default_post_ops()) {
+        dnnl_status_t status = dnnl_brgemm_set_post_ops(
+                get(), ldd, memory::convert_to_c(d_dt), po.get());
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set post operations");
+    }
+
+    /// Sets tensor A scales mask to a BRGeMM ukernel object.
+    ///
+    /// For quantization flavor tensor A scales apply to accumulation buffer
+    /// once C is ready.
+    ///
+    /// @param a_scale_mask Tensor A scale mask. Can be `0` only.
+    void set_A_scales(int a_scale_mask) {
+        dnnl_status_t status = dnnl_brgemm_set_A_scales(get(), a_scale_mask);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set A scales");
+    }
+
+    /// Sets tensor B scales mask to a BRGeMM ukernel object.
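+    ///
+    /// Illustrative sketch (the `brg` object here is hypothetical): a mask
+    /// of `2` means at least N scale values are expected at execution time,
+    /// supplied later via attr_params::set_B_scales().
+    /// @code
+    /// brg.set_B_scales(/*b_scale_mask=*/2);
+    /// @endcode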
+    ///
+    /// For quantization flavor tensor B scales apply to accumulation buffer
+    /// once C is ready.
+    ///
+    /// @param b_scale_mask Tensor B scale mask. Can be `0` and `2` only.
+    void set_B_scales(int b_scale_mask) {
+        dnnl_status_t status = dnnl_brgemm_set_B_scales(get(), b_scale_mask);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set B scales");
+    }
+
+    /// Sets tensor D scales mask to a BRGeMM ukernel object.
+    ///
+    /// For quantization flavor tensor D scales apply after all post-ops are
+    /// applied.
+    ///
+    /// @param d_scale_mask Tensor D scale mask. Can be `0` only.
+    void set_D_scales(int d_scale_mask) {
+        dnnl_status_t status = dnnl_brgemm_set_D_scales(get(), d_scale_mask);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set D scales");
+    }
+
+    /// Finalizes initialization of a BRGeMM ukernel object.
+    ///
+    /// This step must be performed prior to querying information from the
+    /// object.
+    void finalize() {
+        dnnl_status_t status = dnnl_brgemm_finalize(get());
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not finalize an object");
+    }
+
+    /// Returns the packing type expected by a tensor B of a BRGeMM ukernel
+    /// object.
+    pack_type get_B_pack_type() const {
+        dnnl_pack_type_t c_pack_type;
+        dnnl_status_t status = dnnl_brgemm_get_B_pack_type(get(), &c_pack_type);
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not query B pack type");
+
+        return static_cast<pack_type>(c_pack_type);
+    }
+
+    /// Returns the size of a scratchpad memory needed for the BRGeMM ukernel
+    /// object.
+    size_t get_scratchpad_size() const {
+        size_t size;
+        dnnl_status_t status = dnnl_brgemm_get_scratchpad_size(get(), &size);
+        if (status != dnnl_success)
+            error::wrap_c_api(status,
+                    "could not query a scratchpad size from a BRGeMM ukernel "
+                    "object");
+        return size;
+    }
+
+    /// Returns the flag indicating when the call to execute with post
+    /// operations is valid.
+    ///
+    /// `True` is for a valid call, `false`, otherwise.
+    bool is_execute_postops_valid() const {
+        int valid;
+        dnnl_status_t status
+                = dnnl_brgemm_is_execute_postops_valid(get(), &valid);
+        if (status != dnnl_success)
+            error::wrap_c_api(status,
+                    "could not query a flag for execute postops from a BRGeMM "
+                    "ukernel object");
+        return static_cast<bool>(valid);
+    }
+
+    /// Initializes the hardware-specific context. Affects the global state for
+    /// all BRGeMM ukernel objects. If no initialization required, returns.
+    void set_hw_context() const {
+        dnnl_status_t status = dnnl_brgemm_set_hw_context(get());
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not set hardware context");
+    }
+
+    /// Releases the hardware-specific context. Affects the global state for
+    /// all BRGeMM ukernel objects. Must be used after all the execution calls
+    /// to BRGeMM ukernel objects.
+    static void release_hw_context() {
+        dnnl_status_t status = dnnl_brgemm_release_hw_context();
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not release hardware context");
+    }
+
+    /// Generates an executable part of BRGeMM ukernel object.
+    void generate() {
+        dnnl_status_t status = dnnl_brgemm_generate(get());
+        if (status != dnnl_success)
+            error::wrap_c_api(status, "could not generate a kernel");
+    }
+
+    /// Executes a BRGeMM ukernel object.
+    ///
+    /// @param A Base pointer to a tensor A.
+    /// @param B Base pointer to a tensor B.
+    /// @param A_B_offsets Vector of pairs of tensors A and B offsets for
+    ///     each batch.
The number of batches must coincide with the
+    ///     `batch_size` value passed at object construction stage.
+    /// @param C Pointer to a tensor C (accumulation buffer).
+    /// @param scratchpad Pointer to a scratchpad buffer.
+    void execute(const void *A, const void *B,
+            const std::vector<std::pair<memory::dim, memory::dim>> &A_B_offsets,
+            void *C, void *scratchpad) const {
+        // TODO: export batch_element to C API later for user to fill it and
+        // pass directly to the call.
+        dnnl_status_t status = dnnl_brgemm_execute(get(), A, B,
+                (const dnnl_dim_t *)A_B_offsets.data(), C, scratchpad);
+        if (status != dnnl_success)
+            error::wrap_c_api(
+                    status, "could not execute a BRGeMM ukernel object");
+    }
+
+    /// Executes a BRGeMM ukernel object with post operations.
+    ///
+    /// @param A Base pointer to a tensor A.
+    /// @param B Base pointer to a tensor B.
+    /// @param A_B_offsets Vector of pairs of tensors A and B offsets for
+    ///     each batch. The number of batches must coincide with the
+    ///     `batch_size` value passed at object construction stage.
+    /// @param C Pointer to a tensor C (accumulation buffer).
+    /// @param D Pointer to a tensor D (output buffer).
+    /// @param scratchpad Pointer to a scratchpad buffer.
+    /// @param params Post-op memory arguments. Must be passed if a binary
+    ///     post-op or scales were set.
+    void execute(const void *A, const void *B,
+            const std::vector<std::pair<memory::dim, memory::dim>> &A_B_offsets,
+            const void *C, void *D, void *scratchpad,
+            const attr_params &params = default_attr_params()) const {
+        // TODO: export batch_element to C API later for user to fill it and
+        // pass directly to the call.
+        dnnl_status_t status = dnnl_brgemm_execute_postops(get(), A, B,
+                (const dnnl_dim_t *)A_B_offsets.data(), C, D, scratchpad,
+                params.get());
+        if (status != dnnl_success)
+            error::wrap_c_api(
+                    status, "could not execute a BRGeMM ukernel object");
+    }
+
+    /// Returns a constant reference to a static instance of default constructed
+    /// primitive post-operations attribute.
+    static const post_ops &default_post_ops() {
+        static const post_ops po;
+        return po;
+    }
+
+    /// Returns a constant reference to a static instance of default constructed
+    /// ukernel attributes parameters.
+    static const attr_params &default_attr_params() {
+        static const attr_params ap;
+        return ap;
+    }
+};
+/// @} dnnl_api_ukernel_brgemm
+
+/// @addtogroup dnnl_api_ukernel_transform Transform ukernel
+/// Transform routines
+/// @{
+
+/// Transform ukernel
+struct transform : public handle<dnnl_transform_t> {
+    /// Default constructor. Produces an empty object.
+    transform() = default;
+
+    /// Constructs a transform object.
+    ///
+    /// @param K Dimension K.
+    /// @param N Dimension N.
+    /// @param in_pack_type Input packing type. Must be one of
+    ///     `pack_type::no_trans`, or `pack_type::trans`.
+    /// @param in_ld Input leading dimension.
+    /// @param out_ld Output leading dimension. Specifies a block by N dimension
+    ///     during data packing.
+    /// @param in_dt Input data type.
+    /// @param out_dt Output data type.
+    /// @param allow_empty A flag signifying whether construction is
+    ///     allowed to fail without throwing an exception. In this case an
+    ///     empty object will be produced. This flag is optional and
+    ///     defaults to false.
+    transform(memory::dim K, memory::dim N, pack_type in_pack_type,
+            memory::dim in_ld, memory::dim out_ld, memory::data_type in_dt,
+            memory::data_type out_dt, bool allow_empty = false) {
+
+        dnnl_transform_t transform = nullptr;
+        dnnl_status_t status = dnnl_transform_create(&transform, K, N,
+                static_cast<dnnl_pack_type_t>(in_pack_type), in_ld, out_ld,
+                memory::convert_to_c(in_dt), memory::convert_to_c(out_dt));
+
+        if (!allow_empty)
+            error::wrap_c_api(status,
+                    "could not create a BRGeMM ukernel packing B object");
+        reset(transform);
+    }
+
+    /// Generates an executable part of transform object.
+    void generate() {
+        dnnl_status_t status = dnnl_transform_generate(get());
+        if (status != dnnl_success)
+            error::wrap_c_api(status,
+                    "could not generate a BRGeMM ukernel packing B object");
+    }
+
+    /// Executes a transform object.
+    ///
+    /// @param in Pointer to an input buffer.
+    /// @param out Pointer to an output buffer.
+    void execute(const void *in, void *out) const {
+        dnnl_status_t status = dnnl_transform_execute(get(), in, out);
+        if (status != dnnl_success)
+            error::wrap_c_api(status,
+                    "could not execute a BRGeMM ukernel packing B object");
+    }
+};
+
+/// @} dnnl_api_ukernel_transform
+
+#endif
+
+} // namespace ukernel
+
+/// @} dnnl_api_ukernel
+
+} // namespace dnnl
+
+/// @} dnnl_api
+
+#endif /* ONEAPI_DNNL_DNNL_UKERNEL_HPP */
diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel_types.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..644e50e2adf727188bed73cc21e797868ae8cd2e
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_ukernel_types.h
@@ -0,0 +1,93 @@
+/*******************************************************************************
+* Copyright 2024 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+/// @file
+/// ukernel C API types definitions
+
+#ifndef ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H
+#define ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "oneapi/dnnl/dnnl_types.h"
+
+/// @addtogroup dnnl_api
+/// @{
+
+/// @addtogroup dnnl_api_ukernel
+/// @{
+
+#ifdef DNNL_EXPERIMENTAL_UKERNEL
+
+/// Packing specification
+typedef enum {
+    /// Undefined pack type. A guard value.
+    dnnl_pack_type_undef = 0,
+    /// Plain, not transposed layout. Similar to format_tag::ab.
+    dnnl_pack_type_no_trans,
+    /// Plain, transposed layout. Similar to format_tag::ba.
+    dnnl_pack_type_trans,
+    /// Packed by 32 bits along K dimension layout.
+    dnnl_pack_type_pack32,
+} dnnl_pack_type_t;
+
+/// @struct dnnl_ukernel_attr_params
+/// An opaque structure to describe ukernel attributes memory storage.
+struct dnnl_ukernel_attr_params;
+
+/// A ukernel attributes memory storage handle.
+typedef struct dnnl_ukernel_attr_params *dnnl_ukernel_attr_params_t;
+
+/// A constant ukernel attributes memory storage handle.
+typedef const struct dnnl_ukernel_attr_params *const_dnnl_ukernel_attr_params_t; + +/// @addtogroup dnnl_api_ukernel_brgemm +/// @{ + +/// @struct dnnl_brgemm +/// An opaque structure to describe a brgemm ukernel. +struct dnnl_brgemm; + +/// A brgemm ukernel handle. +typedef struct dnnl_brgemm *dnnl_brgemm_t; + +/// A constant brgemm ukernel handle. +typedef const struct dnnl_brgemm *const_dnnl_brgemm_t; + +/// @struct dnnl_transform +/// An opaque structure to describe a transform routine. +struct dnnl_transform; + +/// A transform routine handle. +typedef struct dnnl_transform *dnnl_transform_t; + +/// A constant transform routine handle. +typedef const struct dnnl_transform *const_dnnl_transform_t; + +/// @} dnnl_api_ukernel_brgemm +#endif + +/// @} dnnl_api_ukernel + +/// @} dnnl_api + +#ifdef __cplusplus +} +#endif + +#endif /* ONEAPI_DNNL_DNNL_UKERNEL_TYPES_H */ diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_version.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_version.h new file mode 100644 index 0000000000000000000000000000000000000000..abd7a9d9f041d9e2146273848c53ffe182c9e459 --- /dev/null +++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_version.h @@ -0,0 +1,33 @@ +/******************************************************************************* +* Copyright 2019-2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef ONEAPI_DNNL_DNNL_VERSION_H +#define ONEAPI_DNNL_DNNL_VERSION_H + +// clang-format off + +/// Major version +#define DNNL_VERSION_MAJOR 3 + +/// Minor version +#define DNNL_VERSION_MINOR 7 + +/// Patch version +#define DNNL_VERSION_PATCH 1 + +// clang-format on + +#endif diff --git a/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_version_hash.h b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_version_hash.h new file mode 100644 index 0000000000000000000000000000000000000000..147d397b4343de7deb478410b239f27227c8d73b --- /dev/null +++ b/phivenv/Lib/site-packages/torch/include/oneapi/dnnl/dnnl_version_hash.h @@ -0,0 +1,31 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#ifndef ONEAPI_DNNL_DNNL_VERSION_HASH_H
+#define ONEAPI_DNNL_DNNL_VERSION_HASH_H
+
+// clang-format off
+
+/// Note: this macro and header file were moved to a separate instance to avoid
+/// incremental build issues as moving from commit to commit would trigger a
+/// complete library rebuild. Including a generated header file in a single
+/// translation unit makes this problem go away.
+/// Git commit hash
+#define DNNL_VERSION_HASH "8d263e693366ef8db40acc569cc7d8edf644556d"
+
+// clang-format on
+
+#endif
diff --git a/phivenv/Lib/site-packages/torch/include/pybind11/attr.h b/phivenv/Lib/site-packages/torch/include/pybind11/attr.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ea183b39653c32d9e3458be697e270a6cd695be
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/pybind11/attr.h
@@ -0,0 +1,690 @@
+/*
+    pybind11/attr.h: Infrastructure for processing custom
+    type and function attributes
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "detail/common.h"
+#include "cast.h"
+
+#include <functional>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+/// \addtogroup annotations
+/// @{
+
+/// Annotation for methods
+struct is_method {
+    handle class_;
+    explicit is_method(const handle &c) : class_(c) {}
+};
+
+/// Annotation for setters
+struct is_setter {};
+
+/// Annotation for operators
+struct is_operator {};
+
+/// Annotation for classes that cannot be subclassed
+struct is_final {};
+
+/// Annotation for parent scope
+struct scope {
+    handle value;
+    explicit scope(const handle &s) : value(s) {}
+};
+
+/// Annotation for documentation
+struct doc {
+    const char *value;
+    explicit doc(const char *value) : value(value) {}
+};
+
+/// Annotation for function names
+struct name {
+    const char *value;
+    explicit name(const char *value) : value(value) {}
+};
+
+/// Annotation indicating that a function is an overload associated with a given "sibling"
+struct sibling {
+    handle value;
+    explicit sibling(const handle &value) : value(value.ptr()) {}
+};
+
+/// Annotation indicating that a class derives from another given type
+template <typename T>
+struct base {
+
+    PYBIND11_DEPRECATED(
+        "base<T>() was deprecated in favor of specifying 'T' as a template argument to class_")
+    base() = default;
+};
+
+/// Keep patient alive while nurse lives
+template <size_t Nurse, size_t Patient>
+struct keep_alive {};
+
+/// Annotation indicating that a class is involved in a multiple inheritance relationship
+struct multiple_inheritance {};
+
+/// Annotation which enables dynamic attributes, i.e. adds `__dict__` to a class
+struct dynamic_attr {};
+
+/// Annotation which enables the buffer protocol for a type
+struct buffer_protocol {};
+
+/// Annotation which requests that a special metaclass is created for a type
+struct metaclass {
+    handle value;
+
+    PYBIND11_DEPRECATED("py::metaclass() is no longer required. It's turned on by default now.")
+    metaclass() = default;
+
+    /// Override pybind11's default metaclass
+    explicit metaclass(handle value) : value(value) {}
+};
+
+/// Specifies a custom callback with signature `void (PyHeapTypeObject*)` that
+/// may be used to customize the Python type.
+///
+/// The callback is invoked immediately before `PyType_Ready`.
+///
+/// Note: This is an advanced interface, and uses of it may require changes to
+/// work with later versions of pybind11.
You may wish to consult the
+/// implementation of `make_new_python_type` in `detail/classes.h` to understand
+/// the context in which the callback will be run.
+struct custom_type_setup {
+    using callback = std::function<void(PyHeapTypeObject *heap_type)>;
+
+    explicit custom_type_setup(callback value) : value(std::move(value)) {}
+
+    callback value;
+};
+
+/// Annotation that marks a class as local to the module:
+struct module_local {
+    const bool value;
+    constexpr explicit module_local(bool v = true) : value(v) {}
+};
+
+/// Annotation to mark enums as an arithmetic type
+struct arithmetic {};
+
+/// Mark a function for addition at the beginning of the existing overload chain instead of the end
+struct prepend {};
+
+/** \rst
+    A call policy which places one or more guard variables (``Ts...``) around the function call.
+
+    For example, this definition:
+
+    .. code-block:: cpp
+
+        m.def("foo", foo, py::call_guard<T>());
+
+    is equivalent to the following pseudocode:
+
+    .. code-block:: cpp
+
+        m.def("foo", [](args...) {
+            T scope_guard;
+            return foo(args...); // forwarded arguments
+        });
+ \endrst */
+template <typename... Ts>
+struct call_guard;
+
+template <>
+struct call_guard<> {
+    using type = detail::void_type;
+};
+
+template <typename T>
+struct call_guard<T> {
+    static_assert(std::is_default_constructible<T>::value,
+                  "The guard type must be default constructible");
+
+    using type = T;
+};
+
+template <typename T, typename... Ts>
+struct call_guard<T, Ts...> {
+    struct type {
+        T guard{}; // Compose multiple guard types with left-to-right default-constructor order
+        typename call_guard<Ts...>::type next{};
+    };
+};
+
+/// @} annotations
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+/* Forward declarations */
+enum op_id : int;
+enum op_type : int;
+struct undefined_t;
+template <op_id id, op_type ot, typename L, typename R>
+struct op_;
+void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret);
+
+/// Internal data structure which holds metadata about a keyword argument
+struct argument_record {
+    const char *name;  ///< Argument name
+    const char *descr; ///< Human-readable version of the argument value
+    handle value;      ///< Associated Python object
+    bool convert : 1;  ///< True if the argument is allowed to convert when loading
+    bool none : 1;     ///< True if None is allowed when loading
+
+    argument_record(const char *name, const char *descr, handle value, bool convert, bool none)
+        : name(name), descr(descr), value(value), convert(convert), none(none) {}
+};
+
+/// Internal data structure which holds metadata about a bound function (signature, overloads,
+/// etc.)
+struct function_record {
+    function_record()
+        : is_constructor(false), is_new_style_constructor(false), is_stateless(false),
+          is_operator(false), is_method(false), is_setter(false), has_args(false),
+          has_kwargs(false), prepend(false) {}
+
+    /// Function name
+    char *name = nullptr; /* why no C++ strings? They generate heavier code..
*/
+
+    // User-specified documentation string
+    char *doc = nullptr;
+
+    /// Human-readable version of the function signature
+    char *signature = nullptr;
+
+    /// List of registered keyword arguments
+    std::vector<argument_record> args;
+
+    /// Pointer to lambda function which converts arguments and performs the actual call
+    handle (*impl)(function_call &) = nullptr;
+
+    /// Storage for the wrapped function pointer and captured data, if any
+    void *data[3] = {};
+
+    /// Pointer to custom destructor for 'data' (if needed)
+    void (*free_data)(function_record *ptr) = nullptr;
+
+    /// Return value policy associated with this function
+    return_value_policy policy = return_value_policy::automatic;
+
+    /// True if name == '__init__'
+    bool is_constructor : 1;
+
+    /// True if this is a new-style `__init__` defined in `detail/init.h`
+    bool is_new_style_constructor : 1;
+
+    /// True if this is a stateless function pointer
+    bool is_stateless : 1;
+
+    /// True if this is an operator (__add__), etc.
+    bool is_operator : 1;
+
+    /// True if this is a method
+    bool is_method : 1;
+
+    /// True if this is a setter
+    bool is_setter : 1;
+
+    /// True if the function has a '*args' argument
+    bool has_args : 1;
+
+    /// True if the function has a '**kwargs' argument
+    bool has_kwargs : 1;
+
+    /// True if this function is to be inserted at the beginning of the overload resolution chain
+    bool prepend : 1;
+
+    /// Number of arguments (including py::args and/or py::kwargs, if present)
+    std::uint16_t nargs;
+
+    /// Number of leading positional arguments, which are terminated by a py::args or py::kwargs
+    /// argument or by a py::kw_only annotation.
+    std::uint16_t nargs_pos = 0;
+
+    /// Number of leading arguments (counted in `nargs`) that are positional-only
+    std::uint16_t nargs_pos_only = 0;
+
+    /// Python method object
+    PyMethodDef *def = nullptr;
+
+    /// Python handle to the parent scope (a class or a module)
+    handle scope;
+
+    /// Python handle to the sibling function representing an overload chain
+    handle sibling;
+
+    /// Pointer to next overload
+    function_record *next = nullptr;
+};
+
+/// Special data structure which (temporarily) holds metadata about a bound class
+struct type_record {
+    PYBIND11_NOINLINE type_record()
+        : multiple_inheritance(false), dynamic_attr(false), buffer_protocol(false),
+          default_holder(true), module_local(false), is_final(false) {}
+
+    /// Handle to the parent scope
+    handle scope;
+
+    /// Name of the class
+    const char *name = nullptr;
+
+    // Pointer to RTTI type_info data structure
+    const std::type_info *type = nullptr;
+
+    /// How large is the underlying C++ type?
+    size_t type_size = 0;
+
+    /// What is the alignment of the underlying C++ type?
+    size_t type_align = 0;
+
+    /// How large is the type's holder?
+    size_t holder_size = 0;
+
+    /// The global operator new can be overridden with a class-specific variant
+    void *(*operator_new)(size_t) = nullptr;
+
+    /// Function pointer to class_<..>::init_instance
+    void (*init_instance)(instance *, const void *) = nullptr;
+
+    /// Function pointer to class_<..>::dealloc
+    void (*dealloc)(detail::value_and_holder &) = nullptr;
+
+    /// List of base classes of the newly created type
+    list bases;
+
+    /// Optional docstring
+    const char *doc = nullptr;
+
+    /// Custom metaclass (optional)
+    handle metaclass;
+
+    /// Custom type setup.
+    custom_type_setup::callback custom_type_setup_callback;
+
+    /// Multiple inheritance marker
+    bool multiple_inheritance : 1;
+
+    /// Does the class manage a __dict__?
+ bool dynamic_attr : 1; + + /// Does the class implement the buffer protocol? + bool buffer_protocol : 1; + + /// Is the default (unique_ptr) holder type used? + bool default_holder : 1; + + /// Is the class definition local to the module shared object? + bool module_local : 1; + + /// Is the class inheritable from python classes? + bool is_final : 1; + + PYBIND11_NOINLINE void add_base(const std::type_info &base, void *(*caster)(void *) ) { + auto *base_info = detail::get_type_info(base, false); + if (!base_info) { + std::string tname(base.name()); + detail::clean_type_id(tname); + pybind11_fail("generic_type: type \"" + std::string(name) + + "\" referenced unknown base type \"" + tname + "\""); + } + + if (default_holder != base_info->default_holder) { + std::string tname(base.name()); + detail::clean_type_id(tname); + pybind11_fail("generic_type: type \"" + std::string(name) + "\" " + + (default_holder ? "does not have" : "has") + + " a non-default holder type while its base \"" + tname + "\" " + + (base_info->default_holder ? "does not" : "does")); + } + + bases.append((PyObject *) base_info->type); + +#if PY_VERSION_HEX < 0x030B0000 + dynamic_attr |= base_info->type->tp_dictoffset != 0; +#else + dynamic_attr |= (base_info->type->tp_flags & Py_TPFLAGS_MANAGED_DICT) != 0; +#endif + + if (caster) { + base_info->implicit_casts.emplace_back(type, caster); + } + } +}; + +inline function_call::function_call(const function_record &f, handle p) : func(f), parent(p) { + args.reserve(f.nargs); + args_convert.reserve(f.nargs); +} + +/// Tag for a new-style `__init__` defined in `detail/init.h` +struct is_new_style_constructor {}; + +/** + * Partial template specializations to process custom attributes provided to + * cpp_function_ and class_. These are either used to initialize the respective + * fields in the type_record and function_record data structures or executed at + * runtime to deal with custom call policies (e.g. keep_alive). 
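+ *
+ * For example, a registration such as (hypothetical user code)
+ * `m.def("f", &f, py::keep_alive<1, 2>())` is routed through
+ * `process_attributes<py::keep_alive<1, 2>>`: `init()` runs at binding time,
+ * while the `precall()`/`postcall()` hooks run on every call.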
+ */
+template <typename T, typename SFINAE = void>
+struct process_attribute;
+
+template <typename T>
+struct process_attribute_default {
+    /// Default implementation: do nothing
+    static void init(const T &, function_record *) {}
+    static void init(const T &, type_record *) {}
+    static void precall(function_call &) {}
+    static void postcall(function_call &, handle) {}
+};
+
+/// Process an attribute specifying the function's name
+template <>
+struct process_attribute<name> : process_attribute_default<name> {
+    static void init(const name &n, function_record *r) { r->name = const_cast<char *>(n.value); }
+};
+
+/// Process an attribute specifying the function's docstring
+template <>
+struct process_attribute<doc> : process_attribute_default<doc> {
+    static void init(const doc &n, function_record *r) { r->doc = const_cast<char *>(n.value); }
+};
+
+/// Process an attribute specifying the function's docstring (provided as a C-style string)
+template <>
+struct process_attribute<const char *> : process_attribute_default<const char *> {
+    static void init(const char *d, function_record *r) { r->doc = const_cast<char *>(d); }
+    static void init(const char *d, type_record *r) { r->doc = d; }
+};
+template <>
+struct process_attribute<char *> : process_attribute<const char *> {};
+
+/// Process an attribute indicating the function's return value policy
+template <>
+struct process_attribute<return_value_policy> : process_attribute_default<return_value_policy> {
+    static void init(const return_value_policy &p, function_record *r) { r->policy = p; }
+};
+
+/// Process an attribute which indicates that this is an overloaded function associated with a
+/// given sibling
+template <>
+struct process_attribute<sibling> : process_attribute_default<sibling> {
+    static void init(const sibling &s, function_record *r) { r->sibling = s.value; }
+};
+
+/// Process an attribute which indicates that this function is a method
+template <>
+struct process_attribute<is_method> : process_attribute_default<is_method> {
+    static void init(const is_method &s, function_record *r) {
+        r->is_method = true;
+        r->scope = s.class_;
+    }
+};
+
+/// Process an attribute which indicates that this function is a setter
+template <>
+struct process_attribute<is_setter> : process_attribute_default<is_setter> {
+    static void init(const is_setter &, function_record *r) { r->is_setter = true; }
+};
+
+/// Process an attribute which indicates the parent scope of a method
+template <>
+struct process_attribute<scope> : process_attribute_default<scope> {
+    static void init(const scope &s, function_record *r) { r->scope = s.value; }
+};
+
+/// Process an attribute which indicates that this function is an operator
+template <>
+struct process_attribute<is_operator> : process_attribute_default<is_operator> {
+    static void init(const is_operator &, function_record *r) { r->is_operator = true; }
+};
+
+template <>
+struct process_attribute<is_new_style_constructor>
+    : process_attribute_default<is_new_style_constructor> {
+    static void init(const is_new_style_constructor &, function_record *r) {
+        r->is_new_style_constructor = true;
+    }
+};
+
+inline void check_kw_only_arg(const arg &a, function_record *r) {
+    if (r->args.size() > r->nargs_pos && (!a.name || a.name[0] == '\0')) {
+        pybind11_fail("arg(): cannot specify an unnamed argument after a kw_only() annotation or "
+                      "args() argument");
+    }
+}
+
+inline void append_self_arg_if_needed(function_record *r) {
+    if (r->is_method && r->args.empty()) {
+        r->args.emplace_back("self", nullptr, handle(), /*convert=*/true, /*none=*/false);
+    }
+}
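+// For instance, in a hypothetical binding `cls.def("get", &C::get, py::arg("i"))`,
+// append_self_arg_if_needed() first records the implicit "self" parameter, so the
+// user-supplied py::arg("i") processed below describes the second positional argument.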
+/// Process a keyword argument attribute (*without* a default value)
+template <>
+struct process_attribute<arg> : process_attribute_default<arg> {
+    static void init(const arg &a, function_record *r) {
+        append_self_arg_if_needed(r);
+        r->args.emplace_back(a.name, nullptr, handle(), !a.flag_noconvert, a.flag_none);
+
+        check_kw_only_arg(a, r);
+    }
+};
+
+/// Process a keyword argument attribute (*with* a default value)
+template <>
+struct process_attribute<arg_v> : process_attribute_default<arg_v> {
+    static void init(const arg_v &a, function_record *r) {
+        if (r->is_method && r->args.empty()) {
+            r->args.emplace_back(
+                "self", /*descr=*/nullptr, /*parent=*/handle(), /*convert=*/true, /*none=*/false);
+        }
+
+        if (!a.value) {
+#if defined(PYBIND11_DETAILED_ERROR_MESSAGES)
+            std::string descr("'");
+            if (a.name) {
+                descr += std::string(a.name) + ": ";
+            }
+            descr += a.type + "'";
+            if (r->is_method) {
+                if (r->name) {
+                    descr += " in method '" + (std::string) str(r->scope) + "."
+                             + (std::string) r->name + "'";
+                } else {
+                    descr += " in method of '" + (std::string) str(r->scope) + "'";
+                }
+            } else if (r->name) {
+                descr += " in function '" + (std::string) r->name + "'";
+            }
+            pybind11_fail("arg(): could not convert default argument " + descr
+                          + " into a Python object (type not registered yet?)");
+#else
+            pybind11_fail("arg(): could not convert default argument "
+                          "into a Python object (type not registered yet?). "
+                          "#define PYBIND11_DETAILED_ERROR_MESSAGES or compile in debug mode for "
+                          "more information.");
+#endif
+        }
+        r->args.emplace_back(a.name, a.descr, a.value.inc_ref(), !a.flag_noconvert, a.flag_none);
+
+        check_kw_only_arg(a, r);
+    }
+};
+
+/// Process a keyword-only-arguments-follow pseudo argument
+template <>
+struct process_attribute<kw_only> : process_attribute_default<kw_only> {
+    static void init(const kw_only &, function_record *r) {
+        append_self_arg_if_needed(r);
+        if (r->has_args && r->nargs_pos != static_cast<std::uint16_t>(r->args.size())) {
+            pybind11_fail("Mismatched args() and kw_only(): they must occur at the same relative "
+                          "argument location (or omit kw_only() entirely)");
+        }
+        r->nargs_pos = static_cast<std::uint16_t>(r->args.size());
+    }
+};
+
+/// Process a positional-only-argument maker
+template <>
+struct process_attribute<pos_only> : process_attribute_default<pos_only> {
+    static void init(const pos_only &, function_record *r) {
+        append_self_arg_if_needed(r);
+        r->nargs_pos_only = static_cast<std::uint16_t>(r->args.size());
+        if (r->nargs_pos_only > r->nargs_pos) {
+            pybind11_fail("pos_only(): cannot follow a py::args() argument");
+        }
+        // It also can't follow a kw_only, but a static_assert in pybind11.h checks that
+    }
+};
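+// Worked example (hypothetical signature): for
+//     m.def("f", &f, py::arg("a"), py::pos_only(), py::arg("b"), py::kw_only(), py::arg("c"));
+// the attribute handlers above leave nargs_pos_only == 1 (only `a` is
+// positional-only) and kw_only() resets nargs_pos to 2 (`a` and `b` may still
+// be passed positionally, `c` is keyword-only).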
+/// Process a parent class attribute.  Single inheritance only (class_ itself already guarantees
+/// that)
+template <typename T>
+struct process_attribute<T, enable_if_t<is_pyobject<T>::value>>
+    : process_attribute_default<handle> {
+    static void init(const handle &h, type_record *r) { r->bases.append(h); }
+};
+
+/// Process a parent class attribute (deprecated, does not support multiple inheritance)
+template <typename T>
+struct process_attribute<base<T>> : process_attribute_default<base<T>> {
+    static void init(const base<T> &, type_record *r) { r->add_base(typeid(T), nullptr); }
+};
+
+/// Process a multiple inheritance attribute
+template <>
+struct process_attribute<multiple_inheritance> : process_attribute_default<multiple_inheritance> {
+    static void init(const multiple_inheritance &, type_record *r) {
+        r->multiple_inheritance = true;
+    }
+};
+
+template <>
+struct process_attribute<dynamic_attr> : process_attribute_default<dynamic_attr> {
+    static void init(const dynamic_attr &, type_record *r) { r->dynamic_attr = true; }
+};
+
+template <>
+struct process_attribute<custom_type_setup> {
+    static void init(const custom_type_setup &value, type_record *r) {
+        r->custom_type_setup_callback = value.value;
+    }
+};
+
+template <>
+struct process_attribute<is_final> : process_attribute_default<is_final> {
+    static void init(const is_final &, type_record *r) { r->is_final = true; }
+};
+
+template <>
+struct process_attribute<buffer_protocol> : process_attribute_default<buffer_protocol> {
+    static void init(const buffer_protocol &, type_record *r) { r->buffer_protocol = true; }
+};
+
+template <>
+struct process_attribute<metaclass> : process_attribute_default<metaclass> {
+    static void init(const metaclass &m, type_record *r) { r->metaclass = m.value; }
+};
+
+template <>
+struct process_attribute<module_local> : process_attribute_default<module_local> {
+    static void init(const module_local &l, type_record *r) { r->module_local = l.value; }
+};
+
+/// Process a 'prepend' attribute, putting this at the beginning of the overload chain
+template <>
+struct process_attribute<prepend> : process_attribute_default<prepend> {
+    static void init(const prepend &, function_record *r) { r->prepend = true; }
+};
+
+/// Process an 'arithmetic' attribute for enums (does nothing here)
+template <>
+struct process_attribute<arithmetic> : process_attribute_default<arithmetic> {};
+
+template <typename... Ts>
+struct process_attribute<call_guard<Ts...>> : process_attribute_default<call_guard<Ts...>> {};
+
+/**
+ * Process a keep_alive call policy -- invokes keep_alive_impl during the
+ * pre-call handler if both Nurse, Patient != 0 and uses the post-call handler
+ * otherwise
+ */
+template <size_t Nurse, size_t Patient>
+struct process_attribute<keep_alive<Nurse, Patient>>
+    : public process_attribute_default<keep_alive<Nurse, Patient>> {
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
+    static void precall(function_call &call) {
+        keep_alive_impl(Nurse, Patient, call, handle());
+    }
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N != 0 && P != 0, int> = 0>
+    static void postcall(function_call &, handle) {}
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
+    static void precall(function_call &) {}
+    template <size_t N = Nurse, size_t P = Patient, enable_if_t<N == 0 || P == 0, int> = 0>
+    static void postcall(function_call &call, handle ret) {
+        keep_alive_impl(Nurse, Patient, call, ret);
+    }
+};
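+// Behavior sketch: keep_alive<1, 2> keeps call argument 2 alive at least as long
+// as argument 1 (indices are 1-based; index 0 denotes the return value). When
+// either index is 0 the patient is tied to the return value, so the work happens
+// in postcall() rather than precall(), as selected by the enable_if guards above.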
+/// Recursively iterate over variadic template arguments
+template <typename... Args>
+struct process_attributes {
+    static void init(const Args &...args, function_record *r) {
+        PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(r);
+        PYBIND11_WORKAROUND_INCORRECT_GCC_UNUSED_BUT_SET_PARAMETER(r);
+        using expander = int[];
+        (void) expander{
+            0, ((void) process_attribute<typename std::decay<Args>::type>::init(args, r), 0)...};
+    }
+    static void init(const Args &...args, type_record *r) {
+        PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(r);
+        PYBIND11_WORKAROUND_INCORRECT_GCC_UNUSED_BUT_SET_PARAMETER(r);
+        using expander = int[];
+        (void) expander{0,
+                        (process_attribute<typename std::decay<Args>::type>::init(args, r), 0)...};
+    }
+    static void precall(function_call &call) {
+        PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(call);
+        using expander = int[];
+        (void) expander{0,
+                        (process_attribute<typename std::decay<Args>::type>::precall(call), 0)...};
+    }
+    static void postcall(function_call &call, handle fn_ret) {
+        PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(call, fn_ret);
+        PYBIND11_WORKAROUND_INCORRECT_GCC_UNUSED_BUT_SET_PARAMETER(fn_ret);
+        using expander = int[];
+        (void) expander{
+            0, (process_attribute<typename std::decay<Args>::type>::postcall(call, fn_ret), 0)...};
+    }
+};
+
+template <typename T>
+using is_call_guard = is_instantiation<call_guard, T>;
+
+/// Extract the ``type`` from the first `call_guard` in `Extras...` (or `void_type` if none found)
+template <typename... Extra>
+using extract_guard_t = typename exactly_one_t<is_call_guard, call_guard<>, Extra...>::type;
+
+/// Check the number of named arguments at compile time
+template <typename... Extra,
+          size_t named = constexpr_sum(std::is_base_of<arg, Extra>::value...),
+          size_t self = constexpr_sum(std::is_same<is_method, Extra>::value...)>
+constexpr bool expected_num_args(size_t nargs, bool has_args, bool has_kwargs) {
+    PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(nargs, has_args, has_kwargs);
+    return named == 0 || (self + named + size_t(has_args) + size_t(has_kwargs)) == nargs;
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/phivenv/Lib/site-packages/torch/include/pybind11/buffer_info.h b/phivenv/Lib/site-packages/torch/include/pybind11/buffer_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..c44438ffa079fd8900438adaac9ed0abaf4d39ac
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/pybind11/buffer_info.h
@@ -0,0 +1,208 @@
+/*
+    pybind11/buffer_info.h: Python buffer object interface
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "detail/common.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Default, C-style strides
+inline std::vector<ssize_t> c_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
+    auto ndim = shape.size();
+    std::vector<ssize_t> strides(ndim, itemsize);
+    if (ndim > 0) {
+        for (size_t i = ndim - 1; i > 0; --i) {
+            strides[i - 1] = strides[i] * shape[i];
+        }
+    }
+    return strides;
+}
+
+// F-style strides; default when constructing an array_t with `ExtraFlags & f_style`
+inline std::vector<ssize_t> f_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
+    auto ndim = shape.size();
+    std::vector<ssize_t> strides(ndim, itemsize);
+    for (size_t i = 1; i < ndim; ++i) {
+        strides[i] = strides[i - 1] * shape[i - 1];
+    }
+    return strides;
+}
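+// Sanity example: for a hypothetical shape {2, 3, 4} with itemsize 8,
+// c_strides(...) yields {96, 32, 8} (row-major) and f_strides(...) yields
+// {8, 16, 48} (column-major).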
+template <typename T, typename SFINAE = void>
+struct compare_buffer_info;
+
+PYBIND11_NAMESPACE_END(detail)
+
+/// Information record describing a Python buffer object
+struct buffer_info {
+    void *ptr = nullptr;          // Pointer to the underlying storage
+    ssize_t itemsize = 0;         // Size of individual items in bytes
+    ssize_t size = 0;             // Total number of entries
+    std::string format;           // For homogeneous buffers, this should be set to
+                                  // format_descriptor<T>::format()
+    ssize_t ndim = 0;             // Number of dimensions
+    std::vector<ssize_t> shape;   // Shape of the tensor (1 entry per dimension)
+    std::vector<ssize_t> strides; // Number of bytes between adjacent entries
+                                  // (for each per dimension)
+    bool readonly = false;        // flag to indicate if the underlying storage may be written to
+
+    buffer_info() = default;
+
+    buffer_info(void *ptr,
+                ssize_t itemsize,
+                const std::string &format,
+                ssize_t ndim,
+                detail::any_container<ssize_t> shape_in,
+                detail::any_container<ssize_t> strides_in,
+                bool readonly = false)
+        : ptr(ptr), itemsize(itemsize), size(1), format(format), ndim(ndim),
+          shape(std::move(shape_in)), strides(std::move(strides_in)), readonly(readonly) {
+        if (ndim != (ssize_t) shape.size() || ndim != (ssize_t) strides.size()) {
+            pybind11_fail("buffer_info: ndim doesn't match shape and/or strides length");
+        }
+        for (size_t i = 0; i < (size_t) ndim; ++i) {
+            size *= shape[i];
+        }
+    }
+
+    template <typename T>
+    buffer_info(T *ptr,
+                detail::any_container<ssize_t> shape_in,
+                detail::any_container<ssize_t> strides_in,
+                bool readonly = false)
+        : buffer_info(private_ctr_tag(),
+                      ptr,
+                      sizeof(T),
+                      format_descriptor<T>::format(),
+                      static_cast<ssize_t>(shape_in->size()),
+                      std::move(shape_in),
+                      std::move(strides_in),
+                      readonly) {}
+
+    buffer_info(void *ptr,
+                ssize_t itemsize,
+                const std::string &format,
+                ssize_t size,
+                bool readonly = false)
+        : buffer_info(ptr, itemsize, format, 1, {size}, {itemsize}, readonly) {}
+
+    template <typename T>
+    buffer_info(T *ptr, ssize_t size, bool readonly = false)
+        : buffer_info(ptr, sizeof(T), format_descriptor<T>::format(), size, readonly) {}
+
+    template <typename T>
+    buffer_info(const T *ptr, ssize_t size, bool readonly = true)
+        : buffer_info(
+              const_cast<T *>(ptr), sizeof(T), format_descriptor<T>::format(), size, readonly) {}
+
+    explicit buffer_info(Py_buffer *view, bool ownview = true)
+        : buffer_info(
+              view->buf,
+              view->itemsize,
+              view->format,
+              view->ndim,
+              {view->shape, view->shape + view->ndim},
+              /* Though buffer::request() requests PyBUF_STRIDES, ctypes objects
+               * ignore this flag and return a view with NULL strides.
+               * When strides are NULL, build them manually. */
+              view->strides
+                  ? std::vector<ssize_t>(view->strides, view->strides + view->ndim)
+                  : detail::c_strides({view->shape, view->shape + view->ndim}, view->itemsize),
+              (view->readonly != 0)) {
+        // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
+        this->m_view = view;
+        // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
+        this->ownview = ownview;
+    }
+
+    buffer_info(const buffer_info &) = delete;
+    buffer_info &operator=(const buffer_info &) = delete;
+
+    buffer_info(buffer_info &&other) noexcept { (*this) = std::move(other); }
+
+    buffer_info &operator=(buffer_info &&rhs) noexcept {
+        ptr = rhs.ptr;
+        itemsize = rhs.itemsize;
+        size = rhs.size;
+        format = std::move(rhs.format);
+        ndim = rhs.ndim;
+        shape = std::move(rhs.shape);
+        strides = std::move(rhs.strides);
+        std::swap(m_view, rhs.m_view);
+        std::swap(ownview, rhs.ownview);
+        readonly = rhs.readonly;
+        return *this;
+    }
+
+    ~buffer_info() {
+        if (m_view && ownview) {
+            PyBuffer_Release(m_view);
+            delete m_view;
+        }
+    }
+
+    Py_buffer *view() const { return m_view; }
+    Py_buffer *&view() { return m_view; }
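+    // Typical use from a binding (hypothetical caller code, given a py::buffer b):
+    //     py::buffer_info info = b.request();
+    //     if (info.item_type_is_equivalent_to<double>()) { /* read info.ptr as double */ }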
+    /* True if the buffer item type is equivalent to `T`. */
+    // To define "equivalent" by example:
+    // `buffer_info::item_type_is_equivalent_to<int>(b)` and
+    // `buffer_info::item_type_is_equivalent_to<long>(b)` may both be true
+    // on some platforms, but `int` and `unsigned` will never be equivalent.
+    // For the ground truth, please inspect `detail::compare_buffer_info<>`.
+    template <typename T>
+    bool item_type_is_equivalent_to() const {
+        return detail::compare_buffer_info<T>::compare(*this);
+    }
+
+private:
+    struct private_ctr_tag {};
+
+    buffer_info(private_ctr_tag,
+                void *ptr,
+                ssize_t itemsize,
+                const std::string &format,
+                ssize_t ndim,
+                detail::any_container<ssize_t> &&shape_in,
+                detail::any_container<ssize_t> &&strides_in,
+                bool readonly)
+        : buffer_info(
+              ptr, itemsize, format, ndim, std::move(shape_in), std::move(strides_in), readonly) {}
+
+    Py_buffer *m_view = nullptr;
+    bool ownview = false;
+};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <typename T, typename SFINAE>
+struct compare_buffer_info {
+    static bool compare(const buffer_info &b) {
+        // NOLINTNEXTLINE(bugprone-sizeof-expression) Needed for `PyObject *`
+        return b.format == format_descriptor<T>::format() && b.itemsize == (ssize_t) sizeof(T);
+    }
+};
+
+template <typename T>
+struct compare_buffer_info<T, detail::enable_if_t<std::is_integral<T>::value>> {
+    static bool compare(const buffer_info &b) {
+        return (size_t) b.itemsize == sizeof(T)
+               && (b.format == format_descriptor<T>::value
+                   || ((sizeof(T) == sizeof(long))
+                       && b.format == (std::is_unsigned<T>::value ? "L" : "l"))
+                   || ((sizeof(T) == sizeof(size_t))
+                       && b.format == (std::is_unsigned<T>::value ? "N" : "n")));
+    }
+};
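+// e.g. on an LP64 platform (hypothetical buffer), format "l" with itemsize 8 also
+// satisfies compare_buffer_info<long long>::compare, because the fallback above
+// accepts the "l"/"L" codes whenever sizeof(T) == sizeof(long).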
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/phivenv/Lib/site-packages/torch/include/pybind11/cast.h b/phivenv/Lib/site-packages/torch/include/pybind11/cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a32f537ceca3212dfc4adba65afd3a57881efa3
--- /dev/null
+++ b/phivenv/Lib/site-packages/torch/include/pybind11/cast.h
@@ -0,0 +1,1855 @@
+/*
+    pybind11/cast.h: Partial template specializations to cast between
+    C++ and Python types
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "detail/common.h"
+#include "detail/descr.h"
+#include "detail/type_caster_base.h"
+#include "detail/typeid.h"
+#include "pytypes.h"
+
+#include <array>
+#include <cstring>
+#include <functional>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_WARNING_DISABLE_MSVC(4127)
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <typename type, typename SFINAE = void>
+class type_caster : public type_caster_base<type> {};
+template <typename type>
+using make_caster = type_caster<intrinsic_t<type>>;
+
+// Shortcut for calling a caster's `cast_op_type` cast operator for casting a type_caster to a T
+template <typename T>
+typename make_caster<T>::template cast_op_type<T> cast_op(make_caster<T> &caster) {
+    using result_t = typename make_caster<T>::template cast_op_type<T>; // See PR #4893
+    return caster.operator result_t();
+}
+template <typename T>
+typename make_caster<T>::template cast_op_type<typename std::add_rvalue_reference<T>::type>
+cast_op(make_caster<T> &&caster) {
+    using result_t = typename make_caster<T>::template cast_op_type<
+        typename std::add_rvalue_reference<T>::type>; // See PR #4893
+    return std::move(caster).operator result_t();
+}
+
+template <typename type>
+class type_caster<std::reference_wrapper<type>> {
+private:
+    using caster_t = make_caster<type>;
+    caster_t subcaster;
+    using reference_t = type &;
+    using subcaster_cast_op_type = typename caster_t::template cast_op_type<reference_t>;
+
+    static_assert(
+        std::is_same<typename std::remove_const<type>::type &, subcaster_cast_op_type>::value
+            || std::is_same<reference_t, subcaster_cast_op_type>::value,
+        "std::reference_wrapper<T> caster requires T to have a caster with an "
+        "`operator T &()` or `operator const T &()`");
+
+public:
+    bool load(handle src, bool convert) { return subcaster.load(src, convert); }
+    static constexpr auto name = caster_t::name;
+    static handle
+    cast(const std::reference_wrapper<type> &src, return_value_policy policy, handle parent) {
+        // It is definitely wrong to take ownership of this pointer, so mask that rvp
+        if (policy == return_value_policy::take_ownership
+            || policy == return_value_policy::automatic) {
+            policy = return_value_policy::automatic_reference;
+        }
+        return caster_t::cast(&src.get(), policy, parent);
+    }
+    template <typename T>
+    using cast_op_type = std::reference_wrapper<type>;
+    explicit operator std::reference_wrapper<type>() { return cast_op<type &>(subcaster); }
+};
+
+#define PYBIND11_TYPE_CASTER(type, py_name)                                                       \
+protected:                                                                                        \
+    type value;                                                                                   \
+                                                                                                  \
+public:                                                                                           \
+    static constexpr auto name = py_name;                                                         \
+    template <typename T_,                                                                        \
+              ::pybind11::detail::enable_if_t<                                                    \
+                  std::is_same<type, ::pybind11::detail::remove_cv_t<T_>>::value,                 \
+                  int>                                                                            \
+              = 0>                                                                                \
+    static ::pybind11::handle cast(                                                               \
+        T_ *src, ::pybind11::return_value_policy policy, ::pybind11::handle parent) {             \
+        if (!src)                                                                                 \
+            return ::pybind11::none().release();                                                  \
+        if (policy == ::pybind11::return_value_policy::take_ownership) {                          \
+            auto h = cast(std::move(*src), policy, parent);                                       \
+            delete src;                                                                           \
+            return h;                                                                             \
+        }                                                                                         \
+        return cast(*src, policy, parent);                                                        \
+    }                                                                                             \
+    operator type *() { return &value; }               /* NOLINT(bugprone-macro-parentheses) */   \
+    operator type &() { return value; }                /* NOLINT(bugprone-macro-parentheses) */   \
+    operator type &&() && { return std::move(value); } /* NOLINT(bugprone-macro-parentheses) */   \
+    template <typename T_>                                                                        \
+    using cast_op_type = ::pybind11::detail::movable_cast_op_type<T_>
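+// A minimal sketch of a user-defined caster built on this macro (the type
+// `Point2D` and its 2-tuple representation are hypothetical, not part of this
+// header):
+//
+//     namespace pybind11 { namespace detail {
+//     template <>
+//     struct type_caster<Point2D> {
+//         PYBIND11_TYPE_CASTER(Point2D, const_name("Point2D"));
+//         bool load(handle src, bool /*convert*/) {
+//             if (!isinstance<tuple>(src)) return false;
+//             auto t = reinterpret_borrow<tuple>(src);
+//             if (t.size() != 2) return false;
+//             value.x = t[0].cast<double>();
+//             value.y = t[1].cast<double>();
+//             return true;
+//         }
+//         static handle cast(const Point2D &p, return_value_policy, handle) {
+//             return make_tuple(p.x, p.y).release();
+//         }
+//     };
+//     }} // namespace pybind11::detail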
+
+template <typename CharT>
+using is_std_char_type = any_of<std::is_same<CharT, char>, /* std::string */
+#if defined(PYBIND11_HAS_U8STRING)
+                                std::is_same<CharT, char8_t>, /* std::u8string */
+#endif
+                                std::is_same<CharT, char16_t>, /* std::u16string */
+                                std::is_same<CharT, char32_t>, /* std::u32string */
+                                std::is_same<CharT, wchar_t> /* std::wstring */
+                                >;
+
+template <typename T>
+struct type_caster<T, enable_if_t<std::is_arithmetic<T>::value && !is_std_char_type<T>::value>> {
+    using _py_type_0 = conditional_t<sizeof(T) <= sizeof(long), long, long long>;
+    using _py_type_1 = conditional_t<std::is_signed<T>::value,
+                                     _py_type_0,
+                                     typename std::make_unsigned<_py_type_0>::type>;
+    using py_type = conditional_t<std::is_floating_point<T>::value, double, _py_type_1>;
+
+public:
+    bool load(handle src, bool convert) {
+        py_type py_value;
+
+        if (!src) {
+            return false;
+        }
+
+#if !defined(PYPY_VERSION)
+        auto index_check = [](PyObject *o) { return PyIndex_Check(o); };
+#else
+        // In PyPy 7.3.3, `PyIndex_Check` is implemented by calling `__index__`,
+        // while CPython only considers the existence of `nb_index`/`__index__`.
+        auto index_check = [](PyObject *o) { return hasattr(o, "__index__"); };
+#endif
+
+        if (std::is_floating_point<T>::value) {
+            if (convert || PyFloat_Check(src.ptr())) {
+                py_value = (py_type) PyFloat_AsDouble(src.ptr());
+            } else {
+                return false;
+            }
+        } else if (PyFloat_Check(src.ptr())
+                   || (!convert && !PYBIND11_LONG_CHECK(src.ptr()) && !index_check(src.ptr()))) {
+            return false;
+        } else {
+            handle src_or_index = src;
+            // PyPy: 7.3.7's 3.8 does not implement PyLong_*'s __index__ calls.
+#if PY_VERSION_HEX < 0x03080000 || defined(PYPY_VERSION)
+            object index;
+            if (!PYBIND11_LONG_CHECK(src.ptr())) { // So: index_check(src.ptr())
+                index = reinterpret_steal<object>(PyNumber_Index(src.ptr()));
+                if (!index) {
+                    PyErr_Clear();
+                    if (!convert)
+                        return false;
+                } else {
+                    src_or_index = index;
+                }
+            }
+#endif
+            if (std::is_unsigned<py_type>::value) {
+                py_value = as_unsigned<py_type>(src_or_index.ptr());
+            } else { // signed integer:
+                py_value = sizeof(T) <= sizeof(long)
+                               ? (py_type) PyLong_AsLong(src_or_index.ptr())
+                               : (py_type) PYBIND11_LONG_AS_LONGLONG(src_or_index.ptr());
+            }
+        }
+
+        // Python API reported an error
+        bool py_err = py_value == (py_type) -1 && PyErr_Occurred();
+
+        // Check to see if the conversion is valid (integers should match exactly)
+        // Signed/unsigned checks happen elsewhere
+        if (py_err
+            || (std::is_integral<T>::value && sizeof(py_type) != sizeof(T)
+                && py_value != (py_type) (T) py_value)) {
+            PyErr_Clear();
+            if (py_err && convert && (PyNumber_Check(src.ptr()) != 0)) {
+                auto tmp = reinterpret_steal<object>(std::is_floating_point<T>::value
+                                                         ? PyNumber_Float(src.ptr())
+                                                         : PyNumber_Long(src.ptr()));
+                PyErr_Clear();
+                return load(tmp, false);
+            }
+            return false;
+        }
+
+        value = (T) py_value;
+        return true;
+    }
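+// Conversion-semantics note for the caster above: a Python float never loads
+// into a C++ integer parameter (the PyFloat_Check rejection in load()), while a
+// Python int loads into a C++ floating-point parameter only when convert is
+// true, i.e. not under py::arg().noconvert().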
+
+    template <typename U = T>
+    static typename std::enable_if<std::is_floating_point<U>::value, handle>::type
+    cast(U src, return_value_policy /* policy */, handle /* parent */) {
+        return PyFloat_FromDouble((double) src);
+    }
+
+    template <typename U = T>
+    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_signed<U>::value
+                                       && (sizeof(U) <= sizeof(long)),
+                                   handle>::type
+    cast(U src, return_value_policy /* policy */, handle /* parent */) {
+        return PYBIND11_LONG_FROM_SIGNED((long) src);
+    }
+
+    template <typename U = T>
+    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_unsigned<U>::value
+                                       && (sizeof(U) <= sizeof(unsigned long)),
+                                   handle>::type
+    cast(U src, return_value_policy /* policy */, handle /* parent */) {
+        return PYBIND11_LONG_FROM_UNSIGNED((unsigned long) src);
+    }
+
+    template <typename U = T>
+    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_signed<U>::value
+                                       && (sizeof(U) > sizeof(long)),
+                                   handle>::type
+    cast(U src, return_value_policy /* policy */, handle /* parent */) {
+        return PyLong_FromLongLong((long long) src);
+    }
+
+    template <typename U = T>
+    static typename std::enable_if<!std::is_floating_point<U>::value && std::is_unsigned<U>::value
+                                       && (sizeof(U) > sizeof(unsigned long)),
+                                   handle>::type
+    cast(U src, return_value_policy /* policy */, handle /* parent */) {
+        return PyLong_FromUnsignedLongLong((unsigned long long) src);
+    }
+
+    PYBIND11_TYPE_CASTER(T, const_name<std::is_integral<T>::value>("int", "float"));
+};
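+// Note on the bool caster defined below: in convert mode any object with a
+// usable __bool__/nb_bool slot is accepted (e.g. a hypothetical NumPy bool
+// scalar), and None converts to False; without convert, only exact
+// Py_True/Py_False or NumPy booleans load successfully.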
+
+template <typename T>
+struct void_caster {
+public:
+    bool load(handle src, bool) {
+        if (src && src.is_none()) {
+            return true;
+        }
+        return false;
+    }
+    static handle cast(T, return_value_policy /* policy */, handle /* parent */) {
+        return none().release();
+    }
+    PYBIND11_TYPE_CASTER(T, const_name("None"));
+};
+
+template <>
+class type_caster<void_type> : public void_caster<void_type> {};
+
+template <>
+class type_caster<void> : public type_caster<void_type> {
+public:
+    using type_caster<void_type>::cast;
+
+    bool load(handle h, bool) {
+        if (!h) {
+            return false;
+        }
+        if (h.is_none()) {
+            value = nullptr;
+            return true;
+        }
+
+        /* Check if this is a capsule */
+        if (isinstance<capsule>(h)) {
+            value = reinterpret_borrow<capsule>(h);
+            return true;
+        }
+
+        /* Check if this is a C++ type */
+        const auto &bases = all_type_info((PyTypeObject *) type::handle_of(h).ptr());
+        if (bases.size() == 1) { // Only allowing loading from a single-value type
+            value = values_and_holders(reinterpret_cast<instance *>(h.ptr())).begin()->value_ptr();
+            return true;
+        }
+
+        /* Fail */
+        return false;
+    }
+
+    static handle cast(const void *ptr, return_value_policy /* policy */, handle /* parent */) {
+        if (ptr) {
+            return capsule(ptr).release();
+        }
+        return none().release();
+    }
+
+    template <typename T>
+    using cast_op_type = void *&;
+    explicit operator void *&() { return value; }
+    static constexpr auto name = const_name("capsule");
+
+private:
+    void *value = nullptr;
+};
+
+template <>
+class type_caster<std::nullptr_t> : public void_caster<std::nullptr_t> {};
+
+template <>
+class type_caster<bool> {
+public:
+    bool load(handle src, bool convert) {
+        if (!src) {
+            return false;
+        }
+        if (src.ptr() == Py_True) {
+            value = true;
+            return true;
+        }
+        if (src.ptr() == Py_False) {
+            value = false;
+            return true;
+        }
+        if (convert || is_numpy_bool(src)) {
+            // (allow non-implicit conversion for numpy booleans; is_numpy_bool()
+            // below matches both spellings, since the name lost its trailing
+            // underscore in NumPy 2)
+
+            Py_ssize_t res = -1;
+            if (src.is_none()) {
+                res = 0; // None is implicitly converted to False
+            }
+#if defined(PYPY_VERSION)
+            // On PyPy, check that "__bool__" attr exists
+            else if (hasattr(src, PYBIND11_BOOL_ATTR)) {
+                res = PyObject_IsTrue(src.ptr());
+            }
+#else
+            // Alternate approach for CPython: this does the same as the above, but optimized
+            // using the CPython API so as to avoid an unneeded attribute lookup.
+            else if (auto *tp_as_number = src.ptr()->ob_type->tp_as_number) {
+                if (PYBIND11_NB_BOOL(tp_as_number)) {
+                    res = (*PYBIND11_NB_BOOL(tp_as_number))(src.ptr());
+                }
+            }
+#endif
+            if (res == 0 || res == 1) {
+                value = (res != 0);
+                return true;
+            }
+            PyErr_Clear();
+        }
+        return false;
+    }
+    static handle cast(bool src, return_value_policy /* policy */, handle /* parent */) {
+        return handle(src ? Py_True : Py_False).inc_ref();
+    }
+    PYBIND11_TYPE_CASTER(bool, const_name("bool"));
+
+private:
+    // Test if an object is a NumPy boolean (without fetching the type).
+    static inline bool is_numpy_bool(handle object) {
+        const char *type_name = Py_TYPE(object.ptr())->tp_name;
+        // Name changed to `numpy.bool` in NumPy 2, `numpy.bool_` is needed for 1.x support
+        return std::strcmp("numpy.bool", type_name) == 0
+               || std::strcmp("numpy.bool_", type_name) == 0;
+    }
+};
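+// Encoding behavior sketch for the string_caster below: a C++ std::u16string
+// result is turned into a Python str via PyUnicode_DecodeUTF16, while a Python
+// bytes or bytearray argument is accepted verbatim into a std::string through
+// load_raw(), with no decoding attempt.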
"utf-16" + : "utf-32", + nullptr)); + if (!utfNbytes) { + PyErr_Clear(); + return false; + } + + const auto *buffer + = reinterpret_cast(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr())); + size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT); + // Skip BOM for UTF-16/32 + if (UTF_N > 8) { + buffer++; + length--; + } + value = StringType(buffer, length); + + // If we're loading a string_view we need to keep the encoded Python object alive: + if (IsView) { + loader_life_support::add_patient(utfNbytes); + } + + return true; + } + + static handle + cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) { + const char *buffer = reinterpret_cast(src.data()); + auto nbytes = ssize_t(src.size() * sizeof(CharT)); + handle s = decode_utfN(buffer, nbytes); + if (!s) { + throw error_already_set(); + } + return s; + } + + PYBIND11_TYPE_CASTER(StringType, const_name(PYBIND11_STRING_NAME)); + +private: + static handle decode_utfN(const char *buffer, ssize_t nbytes) { +#if !defined(PYPY_VERSION) + return UTF_N == 8 ? PyUnicode_DecodeUTF8(buffer, nbytes, nullptr) + : UTF_N == 16 ? PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr) + : PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr); +#else + // PyPy segfaults when on PyUnicode_DecodeUTF16 (and possibly on PyUnicode_DecodeUTF32 as + // well), so bypass the whole thing by just passing the encoding as a string value, which + // works properly: + return PyUnicode_Decode(buffer, + nbytes, + UTF_N == 8 ? "utf-8" + : UTF_N == 16 ? "utf-16" + : "utf-32", + nullptr); +#endif + } + + // When loading into a std::string or char*, accept a bytes/bytearray object as-is (i.e. + // without any encoding/decoding attempt). For other C++ char sizes this is a no-op. + // which supports loading a unicode from a str, doesn't take this path. + template + bool load_raw(enable_if_t::value, handle> src) { + if (PYBIND11_BYTES_CHECK(src.ptr())) { + // We were passed raw bytes; accept it into a std::string or char* + // without any encoding attempt. + const char *bytes = PYBIND11_BYTES_AS_STRING(src.ptr()); + if (!bytes) { + pybind11_fail("Unexpected PYBIND11_BYTES_AS_STRING() failure."); + } + value = StringType(bytes, (size_t) PYBIND11_BYTES_SIZE(src.ptr())); + return true; + } + if (PyByteArray_Check(src.ptr())) { + // We were passed a bytearray; accept it into a std::string or char* + // without any encoding attempt. + const char *bytearray = PyByteArray_AsString(src.ptr()); + if (!bytearray) { + pybind11_fail("Unexpected PyByteArray_AsString() failure."); + } + value = StringType(bytearray, (size_t) PyByteArray_Size(src.ptr())); + return true; + } + + return false; + } + + template + bool load_raw(enable_if_t::value, handle>) { + return false; + } +}; + +template +struct type_caster, + enable_if_t::value>> + : string_caster> {}; + +#ifdef PYBIND11_HAS_STRING_VIEW +template +struct type_caster, + enable_if_t::value>> + : string_caster, true> {}; +#endif + +// Type caster for C-style strings. We basically use a std::string type caster, but also add the +// ability to use None as a nullptr char* (which the string caster doesn't allow). 
+
+template <typename CharT, class Traits, class Allocator>
+struct type_caster<std::basic_string<CharT, Traits, Allocator>,
+                   enable_if_t<is_std_char_type<CharT>::value>>
+    : string_caster<std::basic_string<CharT, Traits, Allocator>> {};
+
+#ifdef PYBIND11_HAS_STRING_VIEW
+template <typename CharT, class Traits>
+struct type_caster<std::basic_string_view<CharT, Traits>,
+                   enable_if_t<is_std_char_type<CharT>::value>>
+    : string_caster<std::basic_string_view<CharT, Traits>, true> {};
+#endif
+
+// Type caster for C-style strings. We basically use a std::string type caster, but also add the
+// ability to use None as a nullptr char* (which the string caster doesn't allow).
+template <typename CharT>
+struct type_caster<CharT, enable_if_t<is_std_char_type<CharT>::value>> {
+    using StringType = std::basic_string<CharT>;
+    using StringCaster = make_caster<StringType>;
+    StringCaster str_caster;
+    bool none = false;
+    CharT one_char = 0;
+
+public:
+    bool load(handle src, bool convert) {
+        if (!src) {
+            return false;
+        }
+        if (src.is_none()) {
+            // Defer accepting None to other overloads (if we aren't in convert mode):
+            if (!convert) {
+                return false;
+            }
+            none = true;
+            return true;
+        }
+        return str_caster.load(src, convert);
+    }
+
+    static handle cast(const CharT *src, return_value_policy policy, handle parent) {
+        if (src == nullptr) {
+            return pybind11::none().release();
+        }
+        return StringCaster::cast(StringType(src), policy, parent);
+    }
+
+    static handle cast(CharT src, return_value_policy policy, handle parent) {
+        if (std::is_same<char, CharT>::value) {
+            handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr);
+            if (!s) {
+                throw error_already_set();
+            }
+            return s;
+        }
+        return StringCaster::cast(StringType(1, src), policy, parent);
+    }
+
+    explicit operator CharT *() {
+        return none ? nullptr : const_cast<CharT *>(static_cast<StringType &>(str_caster).c_str());
+    }
+    explicit operator CharT &() {
+        if (none) {
+            throw value_error("Cannot convert None to a character");
+        }
+
+        auto &value = static_cast<StringType &>(str_caster);
+        size_t str_len = value.size();
+        if (str_len == 0) {
+            throw value_error("Cannot convert empty string to a character");
+        }
+
+        // If we're in UTF-8 mode, we have two possible failures: one for a unicode character that
+        // is too high, and one for multiple unicode characters (caught later), so we need to
+        // figure out how long the first encoded character is in bytes to distinguish between these
+        // two errors. We also want to allow unicode characters U+0080 through U+00FF, as
+        // those can fit into a single char value.
+        if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) {
+            auto v0 = static_cast<unsigned char>(value[0]);
+            // low bits only: 0-127
+            // 0b110xxxxx - start of 2-byte sequence
+            // 0b1110xxxx - start of 3-byte sequence
+            // 0b11110xxx - start of 4-byte sequence
+            size_t char0_bytes = (v0 & 0x80) == 0      ? 1
+                                 : (v0 & 0xE0) == 0xC0 ? 2
+                                 : (v0 & 0xF0) == 0xE0 ? 3
+                                                       : 4;
+
+            if (char0_bytes == str_len) {
+                // If we have a 128-255 value, we can decode it into a single char:
+                if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0b110000xx 0b10xxxxxx
+                    one_char = static_cast<CharT>(((v0 & 3) << 6)
+                                                  + (static_cast<unsigned char>(value[1]) & 0x3F));
+                    return one_char;
+                }
+                // Otherwise we have a single character, but it's > U+00FF
+                throw value_error("Character code point not in range(0x100)");
+            }
+        }
+
+        // UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a
+        // surrogate pair with total length 2 instantly indicates a range error (but not a "your
+        // string was too long" error).
+        else if (StringCaster::UTF_N == 16 && str_len == 2) {
+            one_char = static_cast<CharT>(value[0]);
+            if (one_char >= 0xD800 && one_char < 0xE000) {
+                throw value_error("Character code point not in range(0x10000)");
+            }
+        }
+
+        if (str_len != 1) {
+            throw value_error("Expected a character, but multi-character string found");
+        }
+
+        one_char = value[0];
+        return one_char;
+    }
+
+    static constexpr auto name = const_name(PYBIND11_STRING_NAME);
+    template <typename _T>
+    using cast_op_type = pybind11::detail::cast_op_type<_T>;
+};
+
+// Base implementation for std::tuple and std::pair
+template