koichi12 commited on
Commit
7469295
·
verified ·
1 Parent(s): f681997

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/torch/_export/__pycache__/__init__.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/torch/_export/__pycache__/converter.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/torch/_export/__pycache__/error.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/torch/_export/__pycache__/non_strict_utils.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/torch/_export/__pycache__/pass_base.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/torch/_export/__pycache__/tools.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/torch/_export/__pycache__/utils.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/torch/_export/__pycache__/verifier.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/torch/_export/__pycache__/wrappers.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/torch/_export/db/__pycache__/__init__.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/torch/_export/db/__pycache__/case.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/torch/_export/db/__pycache__/gen_example.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/torch/_export/db/__pycache__/logging.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/__init__.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/constrain_as_value_example.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_map.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_round.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_slicing.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/model_attr_mutation.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/optional_input.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/scalar_output.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/static_for_loop.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/static_if.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/type_reflection_method.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/torch/_export/passes/__init__.py +1 -0
  26. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/__init__.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/_node_metadata_hook.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/add_runtime_assertions_for_constraints_pass.cpython-311.pyc +0 -0
  29. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/collect_tracepoints_pass.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/constant_folding.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/functionalize_side_effectful_ops_pass.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/lift_constants_pass.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/remove_runtime_assertions.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/replace_autocast_with_hop_pass.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/replace_quantized_ops_with_standard_ops_pass.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/replace_set_grad_with_hop_pass.cpython-311.pyc +0 -0
  37. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/replace_view_ops_with_view_copy_ops_pass.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/replace_with_hop_pass_util.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/torch/_export/passes/_node_metadata_hook.py +80 -0
  40. .venv/lib/python3.11/site-packages/torch/_export/passes/add_runtime_assertions_for_constraints_pass.py +227 -0
  41. .venv/lib/python3.11/site-packages/torch/_export/passes/collect_tracepoints_pass.py +102 -0
  42. .venv/lib/python3.11/site-packages/torch/_export/passes/constant_folding.py +299 -0
  43. .venv/lib/python3.11/site-packages/torch/_export/passes/functionalize_side_effectful_ops_pass.py +94 -0
  44. .venv/lib/python3.11/site-packages/torch/_export/passes/lift_constants_pass.py +318 -0
  45. .venv/lib/python3.11/site-packages/torch/_export/passes/remove_runtime_assertions.py +27 -0
  46. .venv/lib/python3.11/site-packages/torch/_export/passes/replace_autocast_with_hop_pass.py +179 -0
  47. .venv/lib/python3.11/site-packages/torch/_export/passes/replace_quantized_ops_with_standard_ops_pass.py +673 -0
  48. .venv/lib/python3.11/site-packages/torch/_export/passes/replace_set_grad_with_hop_pass.py +110 -0
  49. .venv/lib/python3.11/site-packages/torch/_export/passes/replace_view_ops_with_view_copy_ops_pass.py +65 -0
  50. .venv/lib/python3.11/site-packages/torch/_export/passes/replace_with_hop_pass_util.py +178 -0
.venv/lib/python3.11/site-packages/torch/_export/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (16.7 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/__pycache__/converter.cpython-311.pyc ADDED
Binary file (82.8 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/__pycache__/error.cpython-311.pyc ADDED
Binary file (2.78 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/__pycache__/non_strict_utils.cpython-311.pyc ADDED
Binary file (24.3 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/__pycache__/pass_base.cpython-311.pyc ADDED
Binary file (27.6 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/__pycache__/tools.cpython-311.pyc ADDED
Binary file (7.09 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/__pycache__/utils.cpython-311.pyc ADDED
Binary file (43.3 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/__pycache__/verifier.cpython-311.pyc ADDED
Binary file (25.7 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/__pycache__/wrappers.cpython-311.pyc ADDED
Binary file (7.99 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (189 Bytes). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/__pycache__/case.cpython-311.pyc ADDED
Binary file (8.44 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/__pycache__/gen_example.cpython-311.pyc ADDED
Binary file (1.36 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/__pycache__/logging.cpython-311.pyc ADDED
Binary file (1.74 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (4.38 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/constrain_as_value_example.cpython-311.pyc ADDED
Binary file (1.56 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_map.cpython-311.pyc ADDED
Binary file (1.27 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_round.cpython-311.pyc ADDED
Binary file (1.34 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/dynamic_shape_slicing.cpython-311.pyc ADDED
Binary file (1.11 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/model_attr_mutation.cpython-311.pyc ADDED
Binary file (2.01 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/optional_input.cpython-311.pyc ADDED
Binary file (1.13 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/scalar_output.cpython-311.pyc ADDED
Binary file (1.52 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/static_for_loop.cpython-311.pyc ADDED
Binary file (1.1 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/static_if.cpython-311.pyc ADDED
Binary file (1.1 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/db/examples/__pycache__/type_reflection_method.cpython-311.pyc ADDED
Binary file (1.49 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .replace_view_ops_with_view_copy_ops_pass import ReplaceViewOpsWithViewCopyOpsPass
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (305 Bytes). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/_node_metadata_hook.cpython-311.pyc ADDED
Binary file (4.17 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/add_runtime_assertions_for_constraints_pass.cpython-311.pyc ADDED
Binary file (12.3 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/collect_tracepoints_pass.cpython-311.pyc ADDED
Binary file (5.73 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/constant_folding.cpython-311.pyc ADDED
Binary file (14.8 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/functionalize_side_effectful_ops_pass.cpython-311.pyc ADDED
Binary file (5.28 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/lift_constants_pass.cpython-311.pyc ADDED
Binary file (15.2 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/remove_runtime_assertions.cpython-311.pyc ADDED
Binary file (1.63 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/replace_autocast_with_hop_pass.cpython-311.pyc ADDED
Binary file (8.71 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/replace_quantized_ops_with_standard_ops_pass.cpython-311.pyc ADDED
Binary file (28.7 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/replace_set_grad_with_hop_pass.cpython-311.pyc ADDED
Binary file (5.98 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/replace_view_ops_with_view_copy_ops_pass.cpython-311.pyc ADDED
Binary file (3.96 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/__pycache__/replace_with_hop_pass_util.cpython-311.pyc ADDED
Binary file (8.48 kB). View file
 
.venv/lib/python3.11/site-packages/torch/_export/passes/_node_metadata_hook.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import contextlib
3
+
4
+ import torch
5
+ from torch.fx.graph_module import GraphModule
6
+
7
+
8
+ _EMPTY_NN_MODULE_STACK_KEY = "_empty_nn_module_stack_from_metadata_hook"
9
+
10
+
11
+ def _node_metadata_hook(node: torch.fx.Node, stack_trace: str) -> None:
12
+ """
13
+ Hook for adding the appropriate metadata to nodes that are created during a
14
+ pass using graph.create_node. An example of how to use it:
15
+
16
+ ```
17
+ with _set_node_metadata_hook(gm,
18
+ functools.partial(_node_metadata_hook, stack_trace="file")
19
+ ):
20
+ pass(gm)
21
+ ```
22
+
23
+ This hook should not work for all generic cases -- specifically it assumes
24
+ that nodes being added are only call_function nodes, and copies over the
25
+ first argument node's nn_module_stack.
26
+ """
27
+ assert node.op == "call_function" and callable(node.target)
28
+
29
+ arg_meta = [arg.meta for arg in node.args if isinstance(arg, torch.fx.Node)]
30
+ assert len(arg_meta) >= 1
31
+ arg_meta = arg_meta[0]
32
+
33
+ if (
34
+ isinstance(node.target, torch._ops.OpOverload)
35
+ and len(node.target._schema.returns) == 0
36
+ ):
37
+ node.meta["val"] = None
38
+ else:
39
+ fake_args = [
40
+ arg.meta["val"] if isinstance(arg, torch.fx.Node) else arg
41
+ for arg in node.args
42
+ ]
43
+ fake_res = node.target(*fake_args)
44
+ node.meta["val"] = fake_res
45
+
46
+ node.meta["stack_trace"] = stack_trace
47
+ node.meta["nn_module_stack"] = arg_meta.get(
48
+ "nn_module_stack",
49
+ {
50
+ _EMPTY_NN_MODULE_STACK_KEY: (
51
+ _EMPTY_NN_MODULE_STACK_KEY,
52
+ _EMPTY_NN_MODULE_STACK_KEY,
53
+ )
54
+ },
55
+ )
56
+ node.meta["torch_fn"] = (
57
+ f"{node.target.__name__}_0",
58
+ f"{node.target.__class__.__name__}.{node.target.__name__}",
59
+ )
60
+
61
+
62
+ @contextlib.contextmanager
63
+ def _set_node_metadata_hook(gm: torch.fx.GraphModule, f):
64
+ """
65
+ Takes a callable which will be called after we create a new node. The
66
+ callable takes the newly created node as input and returns None.
67
+ """
68
+ assert callable(f), "node_metadata_hook must be a callable."
69
+
70
+ # Add the hook to all submodules
71
+ for m in gm.modules():
72
+ if isinstance(m, GraphModule):
73
+ m._register_create_node_hook(f)
74
+ try:
75
+ yield
76
+ finally:
77
+ # Restore hook for all submodules
78
+ for m in gm.modules():
79
+ if isinstance(m, GraphModule):
80
+ m._unregister_create_node_hook(f)
.venv/lib/python3.11/site-packages/torch/_export/passes/add_runtime_assertions_for_constraints_pass.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import math
3
+ import operator
4
+ import traceback
5
+ from functools import partial
6
+ from typing import Callable, Dict, List, NamedTuple, Set
7
+
8
+ import sympy
9
+
10
+ import torch
11
+ import torch.fx
12
+ from torch.utils._sympy.value_ranges import ValueRanges
13
+ from torch.utils._sympy.numbers import int_oo
14
+ from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols
15
+ from torch.fx.passes.infra.pass_base import PassBase, PassResult
16
+
17
+ __all__ = ["InputDim"]
18
+
19
+
20
+ class InputDim(NamedTuple):
21
+ input_name: str
22
+ dim: int
23
+
24
+
25
+ def _convert_to_int(val):
26
+ # Convert simple sympy Integers into concrete int
27
+ if val in (sympy.oo, int_oo):
28
+ return math.inf
29
+ if val in (-sympy.oo, -int_oo):
30
+ return -math.inf
31
+ if isinstance(val, sympy.Integer):
32
+ return int(val)
33
+ raise RuntimeError(
34
+ "Export constraints cannot be non-integer expressions"
35
+ )
36
+
37
+
38
+ def _convert_range_to_int(range: ValueRanges):
39
+ assert isinstance(range, ValueRanges)
40
+ min_val = _convert_to_int(range.lower)
41
+ max_val = _convert_to_int(range.upper)
42
+ return min_val, max_val
43
+
44
+
45
+ class _AddRuntimeAssertionsForInlineConstraintsPass(PassBase):
46
+ def __init__(
47
+ self,
48
+ range_constraints: Dict[sympy.Symbol, ValueRanges],
49
+ ):
50
+ super().__init__()
51
+ self.range_constraints: Dict[sympy.Symbol, ValueRanges] = range_constraints
52
+ self._asserts_generated_unbacked_symbols: Set[sympy.Symbol] = set()
53
+ self.counter = 0
54
+
55
+ def _assert_range_constraint(self, node, lower, upper, assert_msg):
56
+ last_node = node
57
+ if lower > -math.inf:
58
+ last_node = self._insert_assert_async(last_node, operator.ge, node, lower, assert_msg)
59
+
60
+ if upper < math.inf:
61
+ last_node = self._insert_assert_async(last_node, operator.le, node, upper, assert_msg)
62
+
63
+ def _insert_assert_async(self, last_node, op, lower, upper, assert_msg):
64
+ """
65
+ Inserts assert_async call_function nodes in the graph. This function is
66
+ called **during** the interpreter-based pass.
67
+ """
68
+ self.counter += 1
69
+ graph = last_node.graph
70
+ with graph.inserting_after(last_node):
71
+ cmp = graph.call_function(op, (lower, upper), {})
72
+ with graph.inserting_after(cmp):
73
+ cmp_tensor = graph.call_function(torch.ops.aten.scalar_tensor.default, (cmp,), {})
74
+ with graph.inserting_after(cmp_tensor):
75
+ assert_async = graph.call_function(
76
+ torch.ops.aten._assert_async.msg,
77
+ (cmp_tensor, assert_msg),
78
+ {},
79
+ )
80
+ return assert_async
81
+
82
+ def call(self, graph_module) -> PassResult:
83
+ self.existing_inline_assertions = _get_existing_inline_assertions(
84
+ graph_module, self.range_constraints
85
+ )
86
+
87
+ for module in graph_module.modules():
88
+ if not isinstance(module, torch.fx.GraphModule):
89
+ continue
90
+ for node in module.graph.nodes:
91
+ if node.op != "call_function":
92
+ continue
93
+ if "val" not in node.meta:
94
+ continue
95
+
96
+ val = node.meta["val"]
97
+ # In general, we may have to deal the case such as: ret[1].shape[0].
98
+ # We need first find out what symbols require assertion, then we need to follow the path
99
+ # from ret to the symbol, construct the proxies along the way and construct the messages
100
+ # piece-wise at the same time.
101
+ #
102
+ # We use post-order traversal to collect all the proxies callbacks needed, construct
103
+ # the error message callbacks, and at the top-level traversal tree we execute all the callbacks.
104
+ # We need the callbacks because, in order to call the function to create a proxy for shape[0], we
105
+ # need the proxy for shape, which further requires the proxy for ret[1], etc.
106
+
107
+ def add_assertions(val):
108
+ call_backs: List[Callable] = []
109
+ messages: List[str] = []
110
+ if isinstance(val, (torch.SymInt, torch.SymFloat, torch.SymBool)):
111
+ symbol = val.node.expr
112
+ if symbol in self.existing_inline_assertions:
113
+ return call_backs, messages
114
+ if isinstance(symbol, sympy.Symbol) and free_unbacked_symbols(symbol):
115
+ if symbol in self._asserts_generated_unbacked_symbols:
116
+ return call_backs, messages
117
+ # We only care about unbacked symints for these inline
118
+ # constraints, which are prefixed with 'u'
119
+ constraint = self.range_constraints[symbol]
120
+ min_val, max_val = _convert_range_to_int(constraint)
121
+ assert_msg = f" is outside of inline constraint [{min_val}, {max_val}]."
122
+ call_backs.append(
123
+ partial(self._assert_range_constraint, lower=min_val, upper=max_val)
124
+ )
125
+ messages.append(assert_msg)
126
+ self._asserts_generated_unbacked_symbols.add(symbol)
127
+
128
+ elif isinstance(val, torch.Tensor):
129
+ for i, sym in enumerate(val.shape):
130
+ cbs, msgs = add_assertions(sym)
131
+ for cb, msg in zip(cbs, msgs):
132
+ def sym_size_cb(node, assert_msg, dim):
133
+ with node.graph.inserting_after(node):
134
+ dim_node = module.graph.call_function(
135
+ torch.ops.aten.sym_size.int,
136
+ (node, dim),
137
+ {},
138
+ )
139
+ cb(node=dim_node, assert_msg=assert_msg)
140
+ call_backs.append(partial(sym_size_cb, dim=i))
141
+ messages.append(f".shape[{i}]" + msg)
142
+ return call_backs, messages
143
+
144
+ callbacks, messages = add_assertions(val)
145
+ for cb, msg in zip(callbacks, messages):
146
+ cb(node=node, assert_msg=f"{node}" + msg)
147
+
148
+ module.recompile()
149
+
150
+ # Sometimes this pass would return a wrong graph where we have mismatched
151
+ # node names in signature. Before we fix it, let's just skip it.
152
+ if self.counter == 0 and type(self) is _AddRuntimeAssertionsForInlineConstraintsPass:
153
+ return PassResult(graph_module, False)
154
+
155
+ # Populate the stack trace with dummy vals to respect IR
156
+ for node in graph_module.graph.nodes:
157
+ if not node.meta.get("stack_trace", None) and node.op not in ["placeholder", "output"]:
158
+ node.meta["stack_trace"] = "".join(traceback.format_stack(limit=1))
159
+ return PassResult(graph_module, True)
160
+
161
+
162
+ def _get_existing_inline_assertions(
163
+ graph_module: torch.fx.GraphModule,
164
+ range_constraints: Dict[sympy.Symbol, ValueRanges],
165
+ ) -> Dict[sympy.Symbol, ValueRanges]:
166
+ existing_inline_assertions: Dict[sympy.Symbol, ValueRanges] = {}
167
+
168
+ for module in graph_module.modules():
169
+ if not isinstance(module, torch.fx.GraphModule):
170
+ continue
171
+
172
+ # Find all the existing inline assertions. They will look something like:
173
+ # %_local_scalar_dense = call_function[target=torch.ops.aten._local_scalar_dense.default](args = (%arg1_1,), kwargs = {})
174
+ # %ge = call_function[target=operator.ge](args = (%_local_scalar_dense, 0), kwargs = {})
175
+ # %_assert_scalar = call_function[target=torch.ops.aten._assert_scalar.default](args = (%scalar_tensor, "..."), kwargs = {})
176
+ for node in module.graph.nodes:
177
+ if node.target != torch.ops.aten._assert_scalar.default:
178
+ continue
179
+
180
+ compare_arg = node.args[0]
181
+ if not (
182
+ isinstance(compare_arg, torch.fx.Node) and
183
+ compare_arg.op == "call_function" and
184
+ compare_arg.target in (operator.le, operator.ge) and
185
+ len(compare_arg.args) == 2
186
+ ):
187
+ continue
188
+
189
+ compare_op = compare_arg.target
190
+ lhs, rhs = compare_arg.args
191
+
192
+ def maybe_get_symint(x):
193
+ if (
194
+ isinstance(x, torch.fx.Node) and
195
+ "val" in x.meta and
196
+ isinstance(x.meta["val"], torch.SymInt)
197
+ ):
198
+ return x.meta["val"].node.expr
199
+ return x
200
+
201
+ lhs = maybe_get_symint(lhs)
202
+ rhs = maybe_get_symint(rhs)
203
+
204
+ if compare_op == operator.ge:
205
+ lhs, rhs = rhs, lhs
206
+
207
+ if isinstance(lhs, sympy.Symbol) and isinstance(rhs, int):
208
+ symint = lhs
209
+ scalar = rhs
210
+ elif isinstance(rhs, sympy.Symbol) and isinstance(lhs, int):
211
+ symint = rhs
212
+ scalar = lhs
213
+ else:
214
+ continue
215
+
216
+ if symint not in range_constraints:
217
+ raise RuntimeError(f"Unable to find symint {symint} in {range_constraints}")
218
+
219
+ previous_range = existing_inline_assertions.get(symint, ValueRanges(-math.inf, math.inf))
220
+
221
+ if symint is lhs:
222
+ bounds = ValueRanges(-math.inf, scalar)
223
+ else:
224
+ bounds = ValueRanges(scalar, math.inf)
225
+ existing_inline_assertions[symint] = previous_range & bounds
226
+
227
+ return existing_inline_assertions
.venv/lib/python3.11/site-packages/torch/_export/passes/collect_tracepoints_pass.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import operator
3
+
4
+ import torch
5
+ from torch.export.exported_program import ConstantArgument, TensorArgument
6
+ from torch.fx.passes.infra.pass_base import PassBase, PassResult
7
+
8
+
9
+ __all__ = ["CollectTracepointsPass"]
10
+
11
+
12
+ class CollectTracepointsPass(PassBase):
13
+ """
14
+ Performs constant folding and constant propagation.
15
+ """
16
+
17
+ def __init__(self, specs, sig) -> None:
18
+ super().__init__()
19
+ self.specs = specs
20
+ self.sig = sig
21
+
22
+ def call(self, gm):
23
+ def get_arg_spec(arg):
24
+ if isinstance(arg, torch.fx.Node):
25
+ if isinstance(arg.meta.get("val"), torch.Tensor):
26
+ return TensorArgument(name=arg.name)
27
+ else:
28
+ raise AssertionError(
29
+ "Symint input is not implemented yet for submodule call signature."
30
+ )
31
+ else:
32
+ return ConstantArgument(name="", value=arg)
33
+
34
+ for module in gm.modules():
35
+ if not isinstance(module, torch.fx.GraphModule):
36
+ continue
37
+ nn_module_stack = None
38
+ for node in module.graph.nodes:
39
+ if node.op != "call_function":
40
+ continue
41
+ if node.target == torch.ops.higher_order._export_tracepoint:
42
+ kind = node.kwargs["kind"]
43
+ if kind == "module_call_outputs":
44
+ nn_module_stack = node.meta["nn_module_stack"]
45
+ elif kind == "module_call_inputs":
46
+ nn_module_stack = None
47
+ else:
48
+ raise AssertionError(f"Unknown tracepoint kind: {kind}")
49
+ elif node.meta["nn_module_stack"] == nn_module_stack:
50
+ node.meta["nn_module_stack"].popitem()
51
+ else:
52
+ nn_module_stack = None
53
+ nn_module_stack = None
54
+ for node in reversed(module.graph.nodes):
55
+ if node.op != "call_function":
56
+ continue
57
+ if node.target == torch.ops.higher_order._export_tracepoint:
58
+ kind = node.kwargs["kind"]
59
+ if kind == "module_call_inputs":
60
+ nn_module_stack = node.meta["nn_module_stack"]
61
+ elif kind == "module_call_outputs":
62
+ nn_module_stack = None
63
+ else:
64
+ raise AssertionError(f"Unknown tracepoint kind: {kind}")
65
+ elif node.meta["nn_module_stack"] == nn_module_stack:
66
+ node.meta["nn_module_stack"].popitem()
67
+ else:
68
+ nn_module_stack = None
69
+ for module in gm.modules():
70
+ if not isinstance(module, torch.fx.GraphModule):
71
+ continue
72
+ for node in module.graph.nodes:
73
+ if node.op != "call_function":
74
+ continue
75
+ if node.target == torch.ops.higher_order._export_tracepoint:
76
+ for i, arg in enumerate(node.args):
77
+ kind = node.kwargs["kind"]
78
+ if kind == "module_call_inputs":
79
+ self.specs[node.kwargs["path"]].inputs.append(
80
+ get_arg_spec(arg)
81
+ )
82
+ elif kind == "module_call_outputs":
83
+ self.specs[node.kwargs["path"]].outputs.append(
84
+ get_arg_spec(arg)
85
+ )
86
+ else:
87
+ raise AssertionError(f"Unknown tracepoint kind: {kind}")
88
+ if isinstance(arg, torch.fx.Node):
89
+ for user in node.users:
90
+ assert user.op == "call_function"
91
+ assert user.target == operator.getitem
92
+ assert isinstance(user.args[1], int)
93
+ if user.args[1] == i:
94
+ user.replace_all_uses_with(arg)
95
+ self.sig.replace_all_uses(user.name, arg.name)
96
+ break
97
+ users = list(node.users)
98
+ for user in users:
99
+ assert len(user.users) == 0
100
+ gm.graph.erase_node(user)
101
+ gm.graph.erase_node(node)
102
+ return PassResult(gm, True)
.venv/lib/python3.11/site-packages/torch/_export/passes/constant_folding.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import collections
3
+ from collections import defaultdict
4
+ from typing import Any, Callable, Dict, Optional
5
+
6
+ import torch
7
+ import torch.utils._pytree as pytree
8
+
9
+
10
+ aten = torch.ops.aten
11
+
12
+ # We would like to split modules into two subgraphs for runtime weight updates to work correctly.
13
+ # The use case and more information could be found at:
14
+ # https://docs.google.com/document/d/1inZC-8KarJ6gKB7G9egmYLx1V_dKX_apxon0w4zPC0Q/edit?usp=sharing
15
+ META_TAG = "MODULE_TYPE"
16
+ MODULE_TAG = "_MAIN_MODULE"
17
+ CONST_MODULE_TAG = "_CONST_MODULE"
18
+
19
+
20
+ def replace_node_with_constant(gm, node, constant, name=None):
21
+ g = gm.graph
22
+
23
+ if name:
24
+ qualname = name
25
+ else:
26
+ if not hasattr(gm, "_frozen_param_count"):
27
+ gm._frozen_param_count = 0
28
+ i = gm._frozen_param_count
29
+
30
+ while True:
31
+ qualname = f"_frozen_param{i}"
32
+ if not hasattr(gm, qualname):
33
+ break
34
+ i += 1
35
+
36
+ gm._frozen_param_count = i + 1
37
+
38
+ with g.inserting_before(node):
39
+ new_input_node = g.create_node("get_attr", qualname, (), {})
40
+ node.replace_all_uses_with(new_input_node)
41
+ new_input_node.meta.update(node.meta)
42
+ g.erase_node(node)
43
+
44
+ # needed to suppress `does not reference an nn.Module, nn.Parameter, or buffer` warning
45
+ gm.register_buffer(qualname, constant)
46
+ setattr(gm, qualname, constant)
47
+
48
+
49
+ class ConstantFolder(torch.fx.Interpreter):
50
+ def __init__(
51
+ self,
52
+ gm,
53
+ skip_constructors=False,
54
+ ):
55
+ super().__init__(gm)
56
+ self.node_replacements: Dict[torch.fx.Node, Any] = {}
57
+ self.replaced_uses: Dict[torch.fx.Node, int] = collections.Counter()
58
+ self.unknown_value = object()
59
+ self.skip_constructors: bool = skip_constructors
60
+
61
+ # overwrite this to deallocate env values if their only remaining use
62
+ # is the output
63
+ self.user_to_last_uses = self.node_to_last_non_output_use()
64
+
65
+ def is_impure(self, node: torch.fx.node.Node):
66
+ if (
67
+ node.target == torch.ops.prims.convert_element_type.default
68
+ and node.args[0].op == "get_attr" # type: ignore[union-attr]
69
+ and node.args[0].meta["val"].dtype == torch.int8 # type: ignore[union-attr]
70
+ and node.args[1] == torch.bfloat16
71
+ ):
72
+ # For int8_weight -> dq -> bf16_weight
73
+ return True
74
+ if node.target in [
75
+ torch.ops.quantized_decomposed.dequantize_per_channel.default,
76
+ torch.ops.quantized_decomposed.dequantize_per_tensor.default,
77
+ torch.ops.quantized_decomposed.dequantize_per_tensor.tensor,
78
+ ]:
79
+ # For the pattern fp32_weight -> q -> dq
80
+ # We only folding fp32_weight -> q
81
+ # int8_weight and leave dq in graph to be fused
82
+ return True
83
+ return False
84
+
85
+ def node_to_last_non_output_use(self):
86
+ last_non_output_use = collections.defaultdict(list)
87
+ seen_uses = set()
88
+ output_node = next(iter(reversed(self.module.graph.nodes)))
89
+
90
+ for node in reversed(self.module.graph.nodes):
91
+ if node.target == "output":
92
+ continue
93
+
94
+ def add_use(inp):
95
+ if inp in seen_uses:
96
+ return
97
+
98
+ seen_uses.add(inp)
99
+ last_non_output_use[node].append(inp)
100
+
101
+ # In-place is fine since we don't mutate
102
+ pytree.tree_map_only_(torch.fx.Node, add_use, (node.args, node.kwargs))
103
+
104
+ # if this node is only used in output, we want to gc it right away
105
+ if len(node.users) == 1 and output_node in node.users:
106
+ last_non_output_use[node].append(node)
107
+
108
+ return last_non_output_use
109
+
110
    def run_node(self, node):
        """Interpret one node, returning either its concrete value (a foldable
        constant) or `self.unknown_value` to mark it as non-foldable.

        Also records foldable results via `add_node_replacement` and drops
        replacements whose every use has itself been replaced.
        """
        if node.target == "output":
            # because we remove nodes from env on last non output use,
            # re-define them now or we'll get error in interpreter
            def set_env(arg):
                self.env[arg] = self.unknown_value

            # In-place is fine since we don't mutate
            pytree.tree_map_only_(torch.fx.Node, set_env, node.args)
            return super().run_node(node)

        args, kwargs = self.fetch_args_kwargs_from_env(node)
        flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs)

        # We need to do this weird thing because in cases where flattened_inputs
        # contains a ScriptObject, equality checking results in a type error if
        # the types are different.
        if any(
            type(self.unknown_value) == type(input_) and self.unknown_value == input_
            for input_ in flattened_inputs
        ):
            return self.unknown_value

        # TODO - fix errors with this
        if (
            node.op == "call_function"
            and node.target == aten._efficientzerotensor.default
        ):
            return self.unknown_value

        # TODO - constant folding triton kernel returns the inputs -- fix this
        if (
            node.op == "call_function"
            and node.name == "triton_kernel_wrapper_functional_proxy"
        ):
            return self.unknown_value

        # skip constructors, since inductor generates optimal code for them already
        # and turning into tensor would result in an additional global memory read
        # TODO - more complicated strategy
        if (
            self.skip_constructors
            and node.op != "get_attr"
            and not any(isinstance(e, torch.Tensor) for e in flattened_inputs)
        ):
            return self.unknown_value

        # All mutations should either be removed or on inputs which we did not make constant
        # (ops tagged nondeterministic_seeded would not replay deterministically if folded).
        if (
            isinstance(node.target, torch._ops.OpOverload)
            and torch.Tag.nondeterministic_seeded in node.target.tags
        ):
            return self.unknown_value

        out = super().run_node(node)

        if node.op != "get_attr" and isinstance(out, torch.Tensor):
            # Meta tensors carry no data, so there is nothing to fold.
            if out.device.type == "meta":
                return out

            if not self.insertable_tensor_check(out):
                return out

            if self.is_impure(node):
                return self.unknown_value

            self.add_node_replacement(node, out)

            flattened_node_inps = pytree.arg_tree_leaves(*node.args, **node.kwargs)

            for n in flattened_node_inps:
                if not isinstance(n, torch.fx.Node):
                    continue

                self.replaced_uses[n] += 1

            # If every use of an earlier replacement has now itself been
            # replaced, that intermediate constant is no longer needed.
            for to_delete in self.user_to_last_uses.get(node, []):
                if self.replaced_uses[to_delete] == len(to_delete.users):
                    self.node_replacements.pop(to_delete, None)

        return out
191
+
192
+ def insertable_tensor_check(self, tensor: torch.Tensor) -> bool:
193
+ return True
194
+
195
+ def add_node_replacement(self, node: torch.fx.Node, tensor: torch.Tensor) -> None:
196
+ self.node_replacements[node] = tensor
197
+
198
+ def run(self):
199
+ env = {}
200
+ for n in self.module.graph.find_nodes(op="placeholder"):
201
+ env[n] = self.unknown_value
202
+ return super().run(initial_env=env)
203
+
204
+
205
def constant_fold(gm, constraint_fn: Optional[Callable[[torch.fx.Node], bool]] = None):
    """Constant-fold `gm` in place.

    Runs `ConstantFolder` to find foldable nodes, replaces each with a
    constant attribute (subject to the optional `constraint_fn` filter), and
    then deletes attributes that no longer have any users.
    """
    # Fake/functional dispatch modes must be disabled so real tensors are computed.
    with torch.utils._python_dispatch._disable_current_modes():
        cf = ConstantFolder(gm, skip_constructors=True)
        cf.run()

        for node, constant in cf.node_replacements.items():
            # constraint_fn lets the caller veto folding of individual nodes.
            if constraint_fn is not None and not constraint_fn(node):
                continue
            replace_node_with_constant(gm, node, constant)

        erased_params = []
        # Get all attr users by looking up the graph instead from node.users, because in this case
        # _tensor_constant0 and _tensor_constant0_1 are actually refereing to the same tensor.

        # opcode         name                 target            args                        kwargs
        # -------------  -------------------  ----------------  --------------------------  --------
        # placeholder    arg0_1               arg0              ()                          {}
        # get_attr       _tensor_constant0    state             ()                          {}
        # call_function  add                  aten.add.Tensor   (arg0_1, _tensor_constant0) {}
        # get_attr       _tensor_constant0_1  state             ()                          {}
        # call_function  add_                 aten.add_.Tensor  (_tensor_constant0_1, 1)    {}
        # output         output               output            ([add],)                    {}

        get_attr_node_users = defaultdict(list)
        for node in gm.graph.nodes:
            if node.op == "get_attr":
                get_attr_node_users[node.target].extend(node.users.keys())
        for node in gm.graph.find_nodes(op="get_attr"):
            if node.op == "get_attr" and len(get_attr_node_users[node.target]) == 0:
                if hasattr(gm, node.target):
                    delattr(gm, node.target)
                erased_params.append(node)
        # Erase after the scan so we don't mutate the node list mid-iteration.
        for node in erased_params:
            gm.graph.erase_node(node)

        gm.graph.eliminate_dead_code()
        gm.graph.lint()
        gm.recompile()
243
+
244
+
245
def constant_graph_tag(gm: torch.fx.GraphModule):
    """Tag every node's meta with CONST_MODULE_TAG or MODULE_TAG.

    Nodes that are attributes, foldable replacements, or inputs consumed by a
    replacement are marked as belonging to the constant (foldable) part; all
    other nodes are marked as belonging to the main module.
    """
    with torch.utils._python_dispatch._disable_current_modes():
        cf = ConstantFolder(gm, skip_constructors=True)
        cf.run()

        for node in gm.graph.nodes:
            if (
                node.op == "get_attr"
                or node in cf.node_replacements
                or node in cf.replaced_uses
            ):
                node.meta[META_TAG] = CONST_MODULE_TAG
            else:
                node.meta[META_TAG] = MODULE_TAG
259
+
260
+
261
def run_and_get_constant_graph(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
    """
    Construct a GraphModule which corresponds to the part which could be
    constant folded in provided gm.
    """

    constant_graph_tag(gm)
    # We rewrite the tags, if it's a constant being directly consumed, without
    # any folding opportunity, we keep it in main gm.
    for node in gm.graph.find_nodes(op="get_attr"):
        used_to_fold = False
        for u in node.users:
            if u.meta[META_TAG] == CONST_MODULE_TAG:
                used_to_fold = True
                break
        if not used_to_fold:
            node.meta[META_TAG] = MODULE_TAG

    new_graph = torch.fx.Graph()

    # Copy every const-tagged node into the new graph, remapping its inputs.
    node_remapping: Dict[torch.fx.Node, torch.fx.Node] = {}
    output_nodes = []
    for node in gm.graph.nodes:
        if node.meta[META_TAG] == MODULE_TAG:
            continue

        new_node = new_graph.node_copy(node, lambda x: node_remapping[x])
        node_remapping[node] = new_node

        # A const node consumed by the main module becomes an output of the
        # extracted constant graph.
        for user in node.users:
            if user.meta[META_TAG] == MODULE_TAG:
                output_nodes.append(new_node)
                break

    new_graph.output(tuple(output_nodes))
    new_graph.lint()
    new_gm = torch.fx.GraphModule(gm, new_graph)

    return new_gm
.venv/lib/python3.11/site-packages/torch/_export/passes/functionalize_side_effectful_ops_pass.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from typing import Dict, Optional, Tuple, List
3
+
4
+ import torch
5
+ from torch._export.pass_base import _ExportPassBaseDeprecatedDoNotUse, PassResult, Argument
6
+ from torch._export.pass_infra.node_metadata import NodeMetadata
7
+ from torch._export.pass_infra.proxy_value import ProxyValue
8
+ from torch._ops import OpOverload
9
+
10
aten = torch.ops.aten

# Map each non-functional side-effectful op to its functional counterpart,
# which takes and returns an explicit `dep_token` instead of relying on
# hidden side effects.
_NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS: Dict[OpOverload, OpOverload] = {
    aten.sym_constrain_range.default: aten._functional_sym_constrain_range,
    aten._assert_async.msg: aten._functional_assert_async.msg,
}
16
+
17
+
18
class _FunctionalizeSideEffectfulOpsPass(_ExportPassBaseDeprecatedDoNotUse):
    """
    Functionalize ops with side effect in graph module by replacing the op with
    functional version of it. A new dependency token (`dep_token`) will be
    created and propagated through functional ops to output.
    For example:
    ```
    def f(x):
        sym_constrain_range(x.shape[0], min=1, max=3)
        return x.add(3)
    ```
    Will be transformed to:
    ```
    def f(x):
        dep_token0 = _make_dep_token()
        dep_token1 = _functional_sym_constrain_range(
            x.shape[0], min=1, max=3, dep_token=dep_token0
        )

        return x.add(3), dep_token1
    ```
    """

    def __init__(self) -> None:
        super().__init__()
        # Current dependency token threaded through the functionalized ops.
        self._dep_token: Optional[ProxyValue] = None
        # Counter used to give each new dep_token a readable, stable name.
        self._next_dep_token_index: Optional[int] = None

    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        # Early return if no non-functional assertions.
        if not any(
            n.target in _NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS
            for n in graph_module.graph.nodes
        ):
            return PassResult(graph_module=graph_module, modified=False)

        # Work on a copy; the input graph module is left untouched.
        gm = copy.deepcopy(graph_module)
        self._dep_token = None
        self._next_dep_token_index = None
        return super().call(gm)

    def call_operator(
        self,
        op: OpOverload,
        args: Tuple[Argument, ...],
        kwargs: Dict[str, Argument],
        meta: NodeMetadata,
    ) -> ProxyValue:
        if op not in _NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS:
            return super().call_operator(op, args, kwargs, meta)

        # Lazily create the initial dep_token at the first side-effectful op.
        if self._dep_token is None:
            self._dep_token = super().call_operator(
                aten._make_dep_token,
                args=(),
                kwargs={},
                meta=self._create_dummy_node_metadata(),
            )
            self._dep_token.node.name = "dep_token0"
            self._next_dep_token_index = 1

        # Thread the current token through the functional variant; its result
        # becomes the new current token.
        self._dep_token = super().call_operator(
            _NON_FUNCTIONAL_TO_FUNCTIONAL_SIDE_EFFECTFUL_FUNCS[op],
            args=args,
            kwargs={**kwargs, "dep_token": self._dep_token},
            meta=meta,
        )
        assert self._next_dep_token_index is not None
        self._dep_token.node.name = f"dep_token{self._next_dep_token_index}"
        self._next_dep_token_index += 1

        return self._dep_token

    def output(self, results: List[Argument], meta: NodeMetadata) -> ProxyValue:
        assert self._dep_token is not None

        # Append the final dep_token to the outputs so the dependency chain
        # is observable and cannot be dead-code-eliminated.
        return super().output(results=(*results, self._dep_token), meta=meta)  # type: ignore[arg-type]
.venv/lib/python3.11/site-packages/torch/_export/passes/lift_constants_pass.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import collections
3
+ import warnings
4
+ from typing import Any, Dict, List, Union
5
+
6
+ import torch
7
+ from torch._export.verifier import SpecViolationError
8
+ from torch._guards import detect_fake_mode
9
+ from torch._library.fake_class_registry import FakeScriptObject
10
+ from torch._subclasses.fake_tensor import unset_fake_temporarily
11
+ from torch.export.exported_program import (
12
+ ArgumentSpec,
13
+ CustomObjArgument,
14
+ ExportGraphSignature,
15
+ InputKind,
16
+ InputSpec,
17
+ TensorArgument,
18
+ )
19
+
20
+
21
+ class ConstantAttrMap(collections.abc.MutableMapping):
22
+ """A mapping class that understands how to use module constants (tensors,
23
+ ScriptObjects, FakeScriptObjects) as keys. We store tensors and FakeScriptObjects normally,
24
+ but ScriptObjects are stored by hash, because different torch.ScriptObjects can point to
25
+ the same underlying value (but we guarantee that they will `hash()` to the same value
26
+ if that's the case).
27
+ """
28
+
29
+ def __init__(self) -> None:
30
+ # Underlying dict that we use to implement this mapping.
31
+ self._constant_attrs: Dict[
32
+ Union[int, torch.Tensor, FakeScriptObject], List[Any]
33
+ ] = {}
34
+ # Map from the hash(ScriptObject) to the ScriptObject itself. Used for
35
+ # APIs like `__iter__` that should look like they're returning the
36
+ # original ScriptObjects.
37
+ self._script_object_map: Dict[int, torch.ScriptObject] = {}
38
+
39
+ def __getitem__(
40
+ self, key: Union[torch.Tensor, torch.ScriptObject, FakeScriptObject]
41
+ ) -> Any:
42
+ real_key = hash(key) if isinstance(key, torch.ScriptObject) else key
43
+ assert isinstance(real_key, (int, torch.Tensor, FakeScriptObject))
44
+ return self._constant_attrs[real_key]
45
+
46
+ def __setitem__(self, key: Union[torch.Tensor, torch.ScriptObject], value):
47
+ # we shouldn't actually call this, should go to add() instead to handle aliasing
48
+ raise NotImplementedError(
49
+ """Directly setting values for ConstantAttrMap is not supported, please use add(key, value) instead.
50
+ The same key can be mapped to multiple values, for handling constant aliasing."""
51
+ )
52
+
53
+ def add(
54
+ self, key: Union[torch.Tensor, torch.ScriptObject, FakeScriptObject], value: Any
55
+ ) -> None:
56
+ if isinstance(key, torch.ScriptObject):
57
+ if hash(key) not in self._constant_attrs:
58
+ self._constant_attrs[hash(key)] = []
59
+ self._constant_attrs[hash(key)].append(value)
60
+ self._script_object_map[hash(key)] = key
61
+ elif isinstance(key, (torch.Tensor, FakeScriptObject)):
62
+ if key not in self._constant_attrs:
63
+ self._constant_attrs[key] = []
64
+ self._constant_attrs[key].append(value)
65
+ else:
66
+ raise TypeError(
67
+ f"Expected key to be a tensor or ScriptObject, got {type(key)}"
68
+ )
69
+
70
+ def __delitem__(self, key):
71
+ real_key = hash(key) if isinstance(key, torch.ScriptObject) else key
72
+
73
+ del self._constant_attrs[real_key]
74
+
75
+ def __iter__(self):
76
+ for key in self._constant_attrs:
77
+ if isinstance(key, int):
78
+ yield self._script_object_map[key]
79
+ else:
80
+ yield key
81
+
82
+ def __len__(self):
83
+ return len(self._constant_attrs)
84
+
85
+ def __contains__(self, key: object) -> bool:
86
+ real_key = hash(key) if isinstance(key, torch.ScriptObject) else key
87
+ return real_key in self._constant_attrs
88
+
89
+
90
def get_constant_fqn(node: torch.fx.Node, constant_name: str) -> str:
    """Qualify `constant_name` with the FQN of the module that originally
    used the constant, taken from the innermost entry of the node's
    `nn_module_stack` metadata, so the state-dict location stays consistent
    with the eager module."""
    module_stack = node.meta["nn_module_stack"]
    if not module_stack:
        # No owning module recorded; the constant lives at the top level.
        return constant_name
    innermost_fqn = list(module_stack.values())[-1][0]
    return f"{innermost_fqn}.{constant_name}" if innermost_fqn else constant_name
101
+
102
+
103
def _get_first_fqn(
    const_attrs: ConstantAttrMap,
    key: Union[torch.Tensor, torch.ScriptObject, FakeScriptObject],
) -> Any:
    """Return the first FQN recorded for `key`, or None if there is none."""
    candidates = const_attrs.get(key)
    if not candidates:
        return None
    return candidates[0]
109
+
110
+
111
def lift_constants_pass(
    gm: torch.fx.GraphModule,
    graph_signature: ExportGraphSignature,
    constant_attrs: ConstantAttrMap,
) -> Dict[str, Union[torch.Tensor, torch.ScriptObject, FakeScriptObject]]:
    """
    Takes a graph module, graph signature, and modifies them implace to lift any
    constants (tensors or custom classes) as inputs to the graph. Returns a
    dictionary of names to constants.

    Arguments:
        gm (torch.fx.GraphModule): The graph module containing the graph and constants to lift.
        graph_signature (ExportGraphSignature): This graph signature will be
            mutated to add additional CONSTANT_TENSOR and CUSTOM_OBJ inputs.
        constant_attrs (ConstantAttr): A mapping from a constant value to its
            fully-qualified path in `gm`. This is used to maintain consistent
            location of constants between the original module and the exported
            version.

    Returns:
        A dictionary of fqn => constant value.
    """
    all_constants: Dict[
        str, Union[torch.Tensor, torch.ScriptObject, FakeScriptObject]
    ] = {}

    inputs = graph_signature.input_specs
    # Counters continue from any constants already present in the signature,
    # so generated names (lifted_custom_N / lifted_tensor_N) stay unique.
    num_custom_obj = sum(
        input_specs.kind == InputKind.CUSTOM_OBJ for input_specs in inputs
    )
    num_tensor_constants = sum(
        input_specs.kind == InputKind.CONSTANT_TENSOR for input_specs in inputs
    )

    fake_mode = detect_fake_mode(
        tuple(node.meta["val"] for node in gm.graph.nodes if node.op == "placeholder")
    )

    # Lifted placeholders are inserted just before the first user input, so
    # track both that node and its positional index in the signature.
    first_user_input_loc, first_user_input = 0, None
    for node in gm.graph.nodes:
        if node.op == "placeholder" and node.name in graph_signature.user_inputs:
            first_user_input = node
            break
        first_user_input_loc += 1

    lifted_objs = ConstantAttrMap()
    for node in gm.graph.nodes:
        if node.op == "get_attr":
            constant_val = getattr(gm, node.target)
            if constant_val in lifted_objs:
                # We already lifted this constant elsewhere. Just rewrite uses
                # of this get_attr to point to the already-existing placeholder
                # node.
                const_placeholder_node = _get_first_fqn(lifted_objs, constant_val)
                node.replace_all_uses_with(const_placeholder_node)
                gm.graph.erase_node(node)
                continue

            # For ScriptObject, Tensor and FakeScriptObject constants:
            # First check if the constant was an attribute on some module by
            # consulting `constant_attrs` map. If it is, use the fqn that keeps
            # its location consistent with the eager module.
            #
            # If it's not in the `constant_attrs` map, that means it's an inline
            # constant (e.g. x + torch.tensor(0)), and thus did not have a
            # specific location in the eager module. In that case, just generate
            # some name and attach it to the module in which it was used.
            if isinstance(constant_val, (torch.ScriptObject, FakeScriptObject)):
                constant_kind = InputKind.CUSTOM_OBJ
                constant_fqn = _get_first_fqn(constant_attrs, constant_val)
                if constant_fqn is not None:
                    constant_name = constant_fqn.replace(".", "_")
                else:
                    constant_name = f"lifted_custom_{num_custom_obj}"
                    constant_fqn = get_constant_fqn(node, constant_name)
                    num_custom_obj += 1
            elif isinstance(constant_val, torch.Tensor):
                # Remove the parameterness of constant_val
                if isinstance(constant_val, torch.nn.Parameter):
                    warnings.warn(
                        f"{node.target} created when tracing {node.meta['stack_trace']} is a parameter. But"
                        f"it's not registered with register_parameter(). export will treat it as a constant tensor"
                    )
                    # We get the real data out of the parameter by disabling the surrounding fake mode.
                    with unset_fake_temporarily():
                        constant_val = constant_val.data
                constant_kind = InputKind.CONSTANT_TENSOR
                constant_fqn = _get_first_fqn(constant_attrs, constant_val)
                if constant_fqn is not None:
                    constant_name = constant_fqn.replace(".", "_")
                else:
                    constant_name = f"lifted_tensor_{num_tensor_constants}"
                    constant_fqn = get_constant_fqn(node, constant_name)
                    num_tensor_constants += 1
            elif isinstance(constant_val, torch.fx.GraphModule):
                # Submodules are not constants; leave their get_attr alone.
                continue
            elif "LoweredBackendModule" in type(constant_val).__name__:
                continue
            else:
                raise SpecViolationError(
                    f"getattr node {node} referencing unsupported type {type(constant_val)}"
                )

            with gm.graph.inserting_before(first_user_input):
                # Insert the constant node before the first user input
                const_placeholder_node = gm.graph.placeholder(constant_name)
                # match target name with its node name in case there is name collision
                # and suffix is added to node name in fx
                const_placeholder_node.target = const_placeholder_node.name

                for k, v in node.meta.items():
                    const_placeholder_node.meta[k] = v

                # Once the FQN has been used, remove nn_module_stack, stack_trace
                const_placeholder_node.meta.pop("nn_module_stack")
                const_placeholder_node.meta.pop("stack_trace", None)

                input_spec_arg: ArgumentSpec
                if isinstance(constant_val, torch.Tensor):
                    if fake_mode is not None:
                        const_placeholder_node.meta["val"] = fake_mode.from_tensor(
                            constant_val, static_shapes=True
                        )
                        const_placeholder_node.meta["val"].constant = constant_val
                    else:
                        const_placeholder_node.meta["val"] = constant_val
                    input_spec_arg = TensorArgument(name=const_placeholder_node.name)
                elif isinstance(constant_val, torch._C.ScriptObject):
                    class_fqn = constant_val._type().qualified_name()  # type: ignore[attr-defined]
                    const_placeholder_node.meta["val"] = CustomObjArgument(
                        constant_fqn, class_fqn
                    )
                    input_spec_arg = CustomObjArgument(
                        name=const_placeholder_node.name, class_fqn=class_fqn
                    )
                elif isinstance(constant_val, FakeScriptObject):
                    class_fqn = constant_val.script_class_name
                    const_placeholder_node.meta["val"] = CustomObjArgument(
                        constant_fqn, class_fqn, constant_val
                    )
                    input_spec_arg = CustomObjArgument(
                        name=const_placeholder_node.name,
                        class_fqn=class_fqn,
                        fake_val=constant_val,
                    )
                else:
                    raise SpecViolationError(
                        f"tried to lift unsupported type {type(constant_val)} from node {node.format_node()}"
                    )

                lifted_objs.add(constant_val, const_placeholder_node)
                node.replace_all_uses_with(const_placeholder_node)
                gm.graph.erase_node(node)

                # Add the constant as a buffer to the graph signature
                graph_signature.input_specs.insert(
                    first_user_input_loc,
                    InputSpec(
                        kind=constant_kind,
                        arg=input_spec_arg,
                        target=constant_fqn,
                    ),
                )
                # Aliased constants: record every FQN known for this value.
                if constant_val in constant_attrs:
                    for fqn in constant_attrs[constant_val]:
                        all_constants[fqn] = constant_val
                else:
                    all_constants[constant_fqn] = constant_val
                first_user_input_loc += 1

    return all_constants
282
+
283
+
284
def rewrite_script_object_meta(
    gm: torch.fx.GraphModule,
) -> Dict[str, Union[torch.Tensor, torch.ScriptObject, FakeScriptObject],]:
    """When tracing, we produce a graph with FakeScriptObject in the
    meta["val"].

    For now, we rewrite meta["val"] to be a placeholder CustomObjArgument,
    and return the original objects keyed by node name.
    """
    constants: Dict[
        str,
        Union[
            torch.Tensor,
            torch.ScriptObject,
            FakeScriptObject,
        ],
    ] = {}
    for node in gm.graph.nodes:
        if "val" not in node.meta:
            continue

        old_meta = node.meta["val"]

        if isinstance(old_meta, torch.ScriptObject):
            class_fqn = old_meta._type().qualified_name()  # type: ignore[attr-defined]
            new_meta = CustomObjArgument(node.name, class_fqn)
            constants[node.name] = old_meta
            node.meta["val"] = new_meta

        elif isinstance(old_meta, FakeScriptObject):
            class_fqn = old_meta.script_class_name  # type: ignore[attr-defined]
            # FakeScriptObject keeps its fake value on the argument.
            new_meta = CustomObjArgument(node.name, class_fqn, old_meta)
            constants[node.name] = old_meta
            node.meta["val"] = new_meta

    return constants
.venv/lib/python3.11/site-packages/torch/_export/passes/remove_runtime_assertions.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import torch
3
+ from torch.fx.passes.infra.pass_base import PassBase, PassResult
4
+
5
+
6
+ class _RemoveRuntimeAssertionsPass(PassBase):
7
+ """
8
+ Remove runtime assertions inserted by the
9
+ _AddRuntimeAssertionsForInlineConstraintsPass.
10
+ """
11
+
12
+ def call(self, graph_module) -> PassResult:
13
+ modified = False
14
+ for module in graph_module.modules():
15
+ if not isinstance(module, torch.fx.GraphModule):
16
+ continue
17
+ for node in module.graph.nodes:
18
+ if node.target == torch.ops.aten._assert_async.msg:
19
+ assert_async_node = node
20
+ if len(assert_async_node.users) > 0:
21
+ continue
22
+ module.graph.erase_node(assert_async_node)
23
+ # the upstream scalar_tensor <- {le, ge} <- sym_size
24
+ # linear chain of nodes of nodes is removed by the
25
+ # downstream dead code elimination
26
+ modified = True
27
+ return PassResult(graph_module, modified)
.venv/lib/python3.11/site-packages/torch/_export/passes/replace_autocast_with_hop_pass.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ from typing import List
3
+
4
+ import torch
5
+ from torch._higher_order_ops.wrap import wrap_with_autocast
6
+
7
+ from ..utils import node_inline_, nodes_filter, nodes_first, sequential_split
8
+ from .replace_with_hop_pass_util import (
9
+ _replace_with_hop_helper,
10
+ _replace_with_hop_pass_helper,
11
+ _sequential_split_and_maybe_inline_subgraphs_helper,
12
+ )
13
+
14
+
15
+ def _is_autocast_node(node: torch.fx.Node):
16
+ return (
17
+ node
18
+ and node.op == "call_function"
19
+ and node.target
20
+ in [
21
+ torch.amp.autocast_mode._enter_autocast,
22
+ torch.amp.autocast_mode._exit_autocast,
23
+ ]
24
+ )
25
+
26
+
27
+ def _is_enter_autocast_node(node: torch.fx.Node):
28
+ return (
29
+ node
30
+ and node.op == "call_function"
31
+ and node.target == torch.amp.autocast_mode._enter_autocast
32
+ )
33
+
34
+
35
+ def _is_exit_autocast_node(node: torch.fx.Node):
36
+ return (
37
+ node
38
+ and node.op == "call_function"
39
+ and node.target == torch.amp.autocast_mode._exit_autocast
40
+ )
41
+
42
+
43
def _is_autocast_sub_mod(node: torch.fx.Node):
    """
    Check if the first non-placeholder node is `torch.amp.autocast_mode._enter_autocast`.
    """
    if node.op == "call_module":
        assert isinstance(node.target, str)
        # Look up the called submodule on the graph's owning module.
        subgm = getattr(node.graph.owning_module, node.target)
        first_non_ph = nodes_first(
            subgm.graph.nodes, lambda node: node.op != "placeholder"
        )
        if (
            first_non_ph
            and first_non_ph.op == "call_function"
            and first_non_ph.target == torch.amp.autocast_mode._enter_autocast
        ):
            # TODO: check if current auto-cast type is the same as the args of
            # _enter_autocast. If so, return False, i.e. do not create a submodule.
            return True
    return False
62
+
63
+
64
def _check_valid_autocast_block(enter_autocast_node, exit_autocast_node):
    """Sanity-check that `exit_autocast_node` closes `enter_autocast_node`:
    the exit op's first argument must be the matching enter node."""
    assert _is_enter_autocast_node(enter_autocast_node)
    assert _is_exit_autocast_node(exit_autocast_node)
    assert exit_autocast_node.args[0] == enter_autocast_node
68
+
69
+
70
def _replace_with_hop(node: torch.fx.Node):
    """Rewrite the autocast submodule called by `node` into a
    `wrap_with_autocast` higher-order op, removing the raw
    enter/exit autocast nodes from the subgraph."""
    assert node.op == "call_module"
    graph: torch.fx.Graph = node.graph
    gm: torch.fx.GraphModule = graph.owning_module
    assert isinstance(node.target, str)
    sub_gm = getattr(gm, node.target)
    sub_graph = sub_gm.graph
    autocast_nodes = nodes_filter(sub_graph.nodes, _is_autocast_node)
    if len(autocast_nodes) > 0:
        assert len(autocast_nodes) > 1  # need at least an enter node and an exist node
        # The submodule begins with an enter and ends with the matching exit.
        enter_autocast_node = autocast_nodes[0]
        exit_autocast_node = autocast_nodes[-1]
        _check_valid_autocast_block(enter_autocast_node, exit_autocast_node)

        _replace_with_hop_helper(
            node, enter_autocast_node, _is_autocast_node, wrap_with_autocast
        )
        # The autocast markers are now implied by the HOP; drop them.
        sub_graph.erase_node(exit_autocast_node)
        sub_graph.erase_node(enter_autocast_node)
89
+
90
+
91
def _split_autocast(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
    """
    split_autocast creates a new graph module that splits the input graph module into multiple submodules
    based on the `_enter_autocast` and `_exit_autocast` nodes. It doesn't mutate the input graph module.

    Nodes between the **outer-most** `_enter_autocast` and `_exit_autocast(_enter_autocast)` are splitted
    into a submodule. Nested autocast regions are not splitted.
    `_enter_autocast` and `_exit_autocast(_enter_autocast)` nodes are in the submodule as well.

    Below is an example of splitting. A, B, C, D, E are blocks of non-autocast nodes in the original graph
    module. Nodes marked with the same number are grouped into the same submodule.
    A   # 0
    enter_autocast  # 1
    B   # 1
    exit_autocast  # 1
    C   # 2
    enter_autocast  # 3
    D   # 3
    exit_autocast  # 3
    E   # 4
    """
    # Stack of currently-open `_enter_autocast` nodes.
    enter_autocast_node_stack: List[torch.fx.Node] = []
    # Set when the outermost region closes, so the next node starts a new submodule.
    first_node_after_outer_most_exit: bool = False

    def node_call_back(node: torch.fx.Node):
        # Returns True when `node` should be the first node of a new submodule.
        nonlocal enter_autocast_node_stack, first_node_after_outer_most_exit
        if first_node_after_outer_most_exit or (
            len(enter_autocast_node_stack) == 0 and _is_enter_autocast_node(node)
        ):
            assert len(enter_autocast_node_stack) == 0
            first_node_after_outer_most_exit = False
            if _is_enter_autocast_node(node):
                enter_autocast_node_stack.append(node)
            return True
        if _is_exit_autocast_node(node):
            assert len(enter_autocast_node_stack) > 0
            last_enter_autocast_node = enter_autocast_node_stack.pop()
            assert node.args[0] == last_enter_autocast_node
            if len(enter_autocast_node_stack) == 0:
                # next node should be in the next submodule since
                # autocast block ends
                first_node_after_outer_most_exit = True
        return False

    return sequential_split(gm, node_call_back)
136
+
137
+
138
def _sequential_split_and_maybe_inline_subgraphs(
    gm: torch.fx.GraphModule, graph_signature
):
    """
    Helper function for replace_autocast_with_hop_pass().
    Split the graph module into multiple subgraphs based on the autocast nodes.
    For each subgraph, decides whether to construct a HOO subgraph, or inline the calls
    back into the parent graph module.
    Nodes between `_enter_autocast` and `_exit_autocast(_enter_autocast)` are considered
    as a subgraph.
    """
    # Nothing to do when the graph contains no autocast markers.
    need_replacing = any(_is_autocast_node(node) for node in gm.graph.nodes)
    if not need_replacing:
        return gm, graph_signature

    # split_autocast returns a new graph module that could have different output
    # args names. We need to fix the graph signature in `_sequential_split_and_maybe_inline_subgraphs_helper`.
    new_gm = _split_autocast(gm)

    def _maybe_inline_or_replace_with_hop(node: torch.fx.Node):
        # Autocast submodules become HOPs; everything else is inlined back.
        if _is_autocast_sub_mod(node):
            _replace_with_hop(node)
        else:
            assert node.op == "call_module"
            assert isinstance(node.target, str)
            node_inline_(node)

    return _sequential_split_and_maybe_inline_subgraphs_helper(
        new_gm, graph_signature, _maybe_inline_or_replace_with_hop
    )
168
+
169
+
170
def replace_autocast_with_hop_pass(gm: torch.fx.GraphModule, graph_signature):
    """
    Replace autocast regions in `gm` with higher-order ops: split the module
    around autocast boundaries via
    `_sequential_split_and_maybe_inline_subgraphs`, then let
    `_replace_with_hop_pass_helper` recurse into each resulting submodule.
    """
    return _replace_with_hop_pass_helper(
        gm, graph_signature, _sequential_split_and_maybe_inline_subgraphs
    )
.venv/lib/python3.11/site-packages/torch/_export/passes/replace_quantized_ops_with_standard_ops_pass.py ADDED
@@ -0,0 +1,673 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ import logging
3
+ import operator
4
+ from typing import List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ import torch.export._trace
8
+ from torch._ops import OpOverload
9
+ from torch.ao.quantization.fx._decomposed import (
10
+ dequantize_per_channel,
11
+ dequantize_per_tensor,
12
+ quantize_per_tensor,
13
+ )
14
+ from torch.ao.quantization.utils import calculate_qmin_qmax
15
+ from torch.fx.graph_module import _assign_attr
16
+
17
+
18
+ log = logging.getLogger(__name__)
19
+
20
+ # Those values will need to be carried over multiple operators.
21
+ _INPUT_Q_DTYPE: Optional[Union[torch.dtype, torch.fx.Node]] = None
22
+ _SCALE: Optional[Union[float, torch.fx.Node]] = None
23
+ _ZERO_POINT: Optional[Union[float, torch.fx.Node]] = None
24
+
25
+
26
+ def int_to_valid_dtype(val: int) -> torch.dtype:
27
+ from torch._export.converter import _TORCH_ENUM_TO_DTYPE # No circular import.
28
+
29
+ if isinstance(val, torch.dtype):
30
+ return val
31
+ dtype = _TORCH_ENUM_TO_DTYPE[val]
32
+ if dtype == torch.quint8:
33
+ return torch.uint8
34
+ elif dtype == torch.qint8:
35
+ return torch.int8
36
+ return dtype
37
+
38
+
39
+ def fx_enum_to_dtype(gm: torch.fx.GraphModule, val: int) -> torch.fx.Node:
40
+ return gm.graph.call_function(int_to_valid_dtype, (val,))
41
+
42
+
43
+ def insert_quantized_node(
44
+ gm: torch.fx.GraphModule,
45
+ val_node: torch.fx.Node,
46
+ scale_node: Union[float, torch.fx.Node],
47
+ zero_point_node: Union[float, torch.fx.Node],
48
+ qmin_node: Union[float, int, torch.fx.Node],
49
+ qmax_node: Union[float, int, torch.fx.Node],
50
+ dtype_node: Union[torch.dtype, torch.fx.Node],
51
+ qscheme: Optional[torch.qscheme],
52
+ ) -> torch.fx.Node:
53
+ return gm.graph.call_function(
54
+ quantize_per_tensor,
55
+ (
56
+ val_node,
57
+ scale_node,
58
+ zero_point_node,
59
+ qmin_node,
60
+ qmax_node,
61
+ dtype_node,
62
+ ),
63
+ )
64
+
65
+
66
+ def get_dequantized(
67
+ val: torch.Tensor,
68
+ scale: Union[float, torch.Tensor],
69
+ zero_point: Union[float, torch.Tensor],
70
+ qmin: Union[float, int],
71
+ qmax: Union[float, int],
72
+ dtype: torch.dtype,
73
+ axis: Optional[int],
74
+ qscheme: Optional[torch.qscheme],
75
+ ) -> torch.Tensor:
76
+ if qscheme is torch.per_tensor_affine:
77
+ return dequantize_per_tensor(
78
+ val,
79
+ scale,
80
+ zero_point,
81
+ qmin,
82
+ qmax,
83
+ dtype,
84
+ )
85
+ elif qscheme is torch.per_channel_affine:
86
+ return dequantize_per_channel(
87
+ val,
88
+ scale,
89
+ zero_point,
90
+ axis,
91
+ qmin,
92
+ qmax,
93
+ dtype,
94
+ )
95
+ else:
96
+ raise RuntimeError(f"Unsupported dequantization scheme: {qscheme}")
97
+
98
+
99
+ def insert_dequantized_node(
100
+ gm: torch.fx.GraphModule,
101
+ val_node: torch.fx.Node,
102
+ scale_node: Union[float, torch.fx.Node],
103
+ zero_point_node: Union[float, torch.fx.Node],
104
+ qmin_node: Union[float, int, torch.fx.Node],
105
+ qmax_node: Union[float, int, torch.fx.Node],
106
+ dtype_node: Union[torch.dtype, torch.fx.Node],
107
+ axis_node: Optional[Union[int, torch.fx.Node]],
108
+ qscheme: Optional[torch.qscheme],
109
+ ) -> torch.fx.Node:
110
+ if qscheme is torch.per_tensor_affine:
111
+ return gm.graph.call_function(
112
+ dequantize_per_tensor,
113
+ (
114
+ val_node,
115
+ scale_node,
116
+ zero_point_node,
117
+ qmin_node,
118
+ qmax_node,
119
+ dtype_node,
120
+ ),
121
+ )
122
+ elif qscheme is torch.per_channel_affine:
123
+ return gm.graph.call_function(
124
+ dequantize_per_channel,
125
+ (
126
+ val_node,
127
+ scale_node,
128
+ zero_point_node,
129
+ axis_node,
130
+ qmin_node,
131
+ qmax_node,
132
+ dtype_node,
133
+ ),
134
+ )
135
+ else:
136
+ raise RuntimeError(f"Unsupported dequantization scheme: {qscheme}")
137
+
138
+
139
+ def get_qmin_qmax(dtype: torch.dtype) -> Tuple[Union[int, float], Union[int, float]]:
140
+ return calculate_qmin_qmax(None, None, False, dtype, False) # type: ignore[arg-type]
141
+
142
+
143
+ def insert_qmin_qmax_node(
144
+ gm: torch.fx.GraphModule, dtype_node: Union[torch.dtype, torch.fx.Node]
145
+ ) -> Tuple[torch.fx.Node, torch.fx.Node]:
146
+ q_min_max_node = gm.graph.call_function(
147
+ calculate_qmin_qmax, (None, None, False, dtype_node, False)
148
+ )
149
+ qmin_node = gm.graph.call_function(operator.getitem, (q_min_max_node, 0))
150
+ qmax_node = gm.graph.call_function(operator.getitem, (q_min_max_node, 1))
151
+ return qmin_node, qmax_node
152
+
153
+
154
+ def get_script_object(
155
+ gm: torch.nn.Module, node: torch.fx.Node
156
+ ) -> torch._C.ScriptObject:
157
+ assert isinstance(node, torch.fx.Node)
158
+ assert node.op == "get_attr"
159
+ attr_name = node.target
160
+ assert isinstance(attr_name, str)
161
+
162
+ mod = gm
163
+ for attr in attr_name.split("."):
164
+ mod = getattr(mod, attr)
165
+ assert isinstance(mod, torch._C.ScriptObject)
166
+ return mod
167
+
168
+
169
+ def insert_weight_and_bias_get_attr_node_from_get_attr_to_scriptobject(
170
+ gm: torch.fx.GraphModule,
171
+ param_node: torch.fx.Node,
172
+ ) -> Tuple[torch.fx.Node, Optional[torch.fx.Node]]:
173
+ """Directly inline tensor from a get_attr fx node."""
174
+ mod = get_script_object(gm, param_node)
175
+ w_qtensor, b_qtensor = mod.unpack() # type: ignore[attr-defined]
176
+ w_attr_name, b_attr_name = (
177
+ f"dequantized_{param_node.target}_w",
178
+ f"dequantized_{param_node.target}_b",
179
+ )
180
+ return insert_weight_and_bias_get_attr_node(
181
+ gm, w_qtensor, b_qtensor, w_attr_name, b_attr_name
182
+ )
183
+
184
+
185
+ def insert_weight_and_bias_get_attr_node_from_get_attr_to_qtensor(
186
+ gm: torch.fx.GraphModule,
187
+ get_attr_to_weight_node: torch.fx.Node,
188
+ get_attr_to_bias_node: Optional[torch.fx.Node],
189
+ ) -> Tuple[torch.fx.Node, Optional[torch.fx.Node]]:
190
+ assert isinstance(get_attr_to_weight_node.target, str)
191
+ w_qtensor = getattr(gm, get_attr_to_weight_node.target)
192
+ w_attr_name = f"dequantized_{get_attr_to_weight_node.target}_w"
193
+
194
+ if get_attr_to_bias_node is not None:
195
+ assert isinstance(get_attr_to_bias_node.target, str)
196
+ b_qtensor = getattr(gm, get_attr_to_bias_node.target)
197
+ b_attr_name = f"dequantized_{get_attr_to_bias_node.target}_b"
198
+ else:
199
+ b_qtensor, b_attr_name = None, ""
200
+
201
+ return insert_weight_and_bias_get_attr_node(
202
+ gm, w_qtensor, b_qtensor, w_attr_name, b_attr_name
203
+ )
204
+
205
+
206
+ def insert_weight_and_bias_get_attr_node(
207
+ gm: torch.fx.GraphModule,
208
+ w_qtensor: torch.Tensor,
209
+ b_qtensor: Optional[torch.Tensor],
210
+ w_attr_name: str,
211
+ b_attr_name: str,
212
+ ) -> Tuple[torch.fx.Node, Optional[torch.fx.Node]]:
213
+ w_tensor = get_tensor_from_qtensor(w_qtensor)
214
+ _assign_attr(w_tensor, gm, w_attr_name)
215
+ w_tensor_attr = gm.graph.get_attr(w_attr_name)
216
+
217
+ if b_qtensor is not None:
218
+ b_tensor = get_tensor_from_qtensor(b_qtensor, dequant=False)
219
+ _assign_attr(b_tensor, gm, b_attr_name)
220
+ b_tensor_attr = gm.graph.get_attr(b_attr_name)
221
+ else:
222
+ b_tensor_attr = None
223
+
224
+ return w_tensor_attr, b_tensor_attr
225
+
226
+
227
+ def get_tensor_from_qtensor(
228
+ qtensor: torch.Tensor, dequant: bool = True
229
+ ) -> torch.Tensor:
230
+ # Manual conversion because qint8 is not used anymore.
231
+ if qtensor.dtype in [torch.qint8, torch.quint8]:
232
+ tensor = qtensor.int_repr()
233
+ else:
234
+ tensor = qtensor
235
+
236
+ # Weights need dequantization with scaling and zero_point adjustment, but
237
+ # bias does not need that.
238
+ if dequant:
239
+ qscheme = qtensor.qscheme()
240
+ if qscheme == torch.per_channel_affine:
241
+ scale, zero_point, axis = (
242
+ qtensor.q_per_channel_scales(),
243
+ qtensor.q_per_channel_zero_points(),
244
+ qtensor.q_per_channel_axis(),
245
+ )
246
+ else:
247
+ scale, zero_point, axis = (
248
+ qtensor.q_scale(), # type: ignore[assignment]
249
+ qtensor.q_zero_point(), # type: ignore[assignment]
250
+ None,
251
+ )
252
+ dtype = tensor.dtype
253
+ qmin, qmax = get_qmin_qmax(dtype)
254
+ return get_dequantized(
255
+ tensor, scale, zero_point, qmin, qmax, dtype, axis, qscheme
256
+ )
257
+ return tensor
258
+
259
+
260
+ def insert_fused_activation_node(
261
+ gm: torch.fx.GraphModule, opname: str, fx_node: torch.fx.Node
262
+ ) -> torch.fx.Node:
263
+ if opname in ["conv1d_relu", "conv2d_relu", "linear_relu", "add_relu", "mul_relu"]:
264
+ fx_node = gm.graph.call_function(torch.ops.aten.relu, (fx_node,))
265
+ return fx_node
266
+
267
+
268
+ def _conv1d_op_with_squeeze(
269
+ inp: torch.Tensor,
270
+ weight: torch.Tensor,
271
+ bias: Optional[torch.Tensor],
272
+ stride: List[int],
273
+ padding: List[int],
274
+ dilation: List[int],
275
+ groups: int,
276
+ ) -> torch.Tensor:
277
+ # In quantized version, conv1d is emulated using conv2d with squeeze and unsqueeze
278
+ # operations before and after the conv2d operation to match the dimension of weights.
279
+ # Reference: https://github.com/pytorch/pytorch/blob/eca0cb0fbe84bb0a34fa94afe261bceecd52c436/aten/src/ATen/native/quantized/cpu/qconv.cpp#L1827 # noqa: B950
280
+ s_inp = torch.ops.aten.unsqueeze(inp, 2)
281
+ conv1d_res = torch.ops.aten.conv2d(
282
+ s_inp,
283
+ weight,
284
+ bias,
285
+ stride,
286
+ padding,
287
+ dilation,
288
+ groups,
289
+ )
290
+ uns_conv1d_res = torch.ops.aten.squeeze(conv1d_res, 2)
291
+ return uns_conv1d_res
292
+
293
+
294
+ def _transform_conv_with_packedparam(gm: torch.fx.GraphModule, node: torch.fx.Node):
295
+ """Conv specfic transformation function."""
296
+ assert isinstance(node.target, torch._ops.OpOverload)
297
+ opname = node.target._opname
298
+ scale_node, zero_point_node = node.args[2], node.args[3]
299
+
300
+ op_f = (
301
+ torch.ops.aten.conv2d
302
+ if opname in ["conv2d", "conv2d_relu"]
303
+ else _conv1d_op_with_squeeze
304
+ )
305
+
306
+ inp_node, param_node = node.args[0], node.args[1]
307
+ assert isinstance(inp_node, torch.fx.Node)
308
+ assert isinstance(param_node, torch.fx.Node)
309
+
310
+ if param_node.op == "call_function":
311
+ # Using Conv2dPrepackParam from conv_prepack.
312
+ # We directly skip the packing call and inline weights and bias.
313
+ w_node, b_node = param_node.args[0], param_node.args[1]
314
+ assert isinstance(w_node, torch.fx.Node)
315
+ assert b_node is None or isinstance(b_node, torch.fx.Node)
316
+ (
317
+ param_0,
318
+ param_1,
319
+ ) = insert_weight_and_bias_get_attr_node_from_get_attr_to_qtensor(
320
+ gm, w_node, b_node
321
+ )
322
+ op_res_node = gm.graph.call_function(
323
+ op_f, (inp_node, param_0, param_1, *param_node.args[2:])
324
+ )
325
+ else:
326
+ # Using ConvPrepackedParam.
327
+ param = get_script_object(gm, param_node)
328
+ (
329
+ param_0,
330
+ param_1,
331
+ ) = insert_weight_and_bias_get_attr_node_from_get_attr_to_scriptobject(
332
+ gm, param_node
333
+ ) # type: ignore[assignment]
334
+ op_res_node = gm.graph.call_function(
335
+ op_f,
336
+ (
337
+ inp_node,
338
+ param_0,
339
+ param_1,
340
+ param.stride(), # type: ignore[attr-defined]
341
+ param.padding(), # type: ignore[attr-defined]
342
+ param.dilation(), # type: ignore[attr-defined]
343
+ param.groups(), # type: ignore[attr-defined]
344
+ ),
345
+ )
346
+ return op_res_node, scale_node, zero_point_node
347
+
348
+
349
+ def _transform_linear_with_packedparam(gm: torch.fx.GraphModule, node: torch.fx.Node):
350
+ """Linear specfic transformation function."""
351
+ scale_node, zero_point_node = node.args[2], node.args[3]
352
+
353
+ inp_node, param_node = node.args[0], node.args[1]
354
+ assert isinstance(inp_node, torch.fx.Node)
355
+ assert isinstance(param_node, torch.fx.Node)
356
+
357
+ if param_node.op == "call_function":
358
+ # Using LinearPrepackParam from linear_prepack.
359
+ # We directly skip the packing call and inline weights and bias.
360
+ w_node, b_node = param_node.args[0], param_node.args[1]
361
+ assert isinstance(w_node, torch.fx.Node)
362
+ assert b_node is None or isinstance(b_node, torch.fx.Node)
363
+ (
364
+ param_0,
365
+ param_1,
366
+ ) = insert_weight_and_bias_get_attr_node_from_get_attr_to_qtensor(
367
+ gm, w_node, b_node
368
+ )
369
+ op_res_node = gm.graph.call_function(
370
+ torch.ops.aten.linear, (inp_node, param_0, param_1, *param_node.args[2:])
371
+ )
372
+ else:
373
+ # Using LinearPackedParams.
374
+ (
375
+ param_0,
376
+ param_1,
377
+ ) = insert_weight_and_bias_get_attr_node_from_get_attr_to_scriptobject(
378
+ gm, param_node
379
+ ) # type: ignore[assignment]
380
+ op_res_node = gm.graph.call_function(
381
+ torch.ops.aten.linear, (inp_node, param_0, param_1)
382
+ )
383
+ return op_res_node, scale_node, zero_point_node
384
+
385
+
386
+ def _transform_op_where_last_two_arguments_are_scale_and_zero_point(
387
+ gm: torch.fx.GraphModule, node: torch.fx.Node
388
+ ):
389
+ """
390
+ This transformation function can be used for function where the last two
391
+ parameters are scale and zero point. Additionally, the function's parameters
392
+ do not need any unpacking.
393
+ """
394
+ to_standard_op = {
395
+ "mul": torch.ops.aten.mul,
396
+ "mul_relu": torch.ops.aten.mul,
397
+ "add": torch.ops.aten.add,
398
+ "add_relu": torch.ops.aten.add,
399
+ "softmax": torch.ops.aten.softmax,
400
+ "cat": torch.ops.aten.cat,
401
+ "hardswish": torch.ops.aten.hardswish,
402
+ }
403
+
404
+ assert isinstance(node.target, torch._ops.OpOverload)
405
+ opname, args = node.target._opname, node.args
406
+ scale_node, zero_point_node = args[-2], args[-1]
407
+ op_res_node = gm.graph.call_function(to_standard_op[opname], tuple(args[:-2]))
408
+ return op_res_node, scale_node, zero_point_node
409
+
410
+
411
+ def _transform_scalar_arithmetic(gm: torch.fx.GraphModule, node: torch.fx.Node):
412
+ """Transform scalar overload for basic arithmetic."""
413
+ to_standard_op = {
414
+ "mul": torch.ops.aten.mul.Scalar,
415
+ "add": torch.ops.aten.add.Scalar,
416
+ }
417
+ assert isinstance(node.target, torch._ops.OpOverload)
418
+ opname, args = node.target._opname, node.args
419
+ op_res_node = gm.graph.call_function(to_standard_op[opname], args)
420
+ return op_res_node, _SCALE, _ZERO_POINT
421
+
422
+
423
+ def _transform_prepacked_op(gm: torch.fx.GraphModule, node: torch.fx.Node):
424
+ """
425
+ Transformation for functions under prepacked namespace, where they share
426
+ the same handling logic that [...]OpContext contains all parameters.
427
+ """
428
+ assert isinstance(node.target, torch._ops.OpOverload)
429
+ opname, args = node.target._opname, node.args
430
+ op_f = None
431
+ if opname == "conv2d_clamp_run":
432
+ op_f = torch.ops.aten.conv2d
433
+ elif opname == "linear_clamp_run":
434
+ op_f = torch.ops.aten.linear
435
+ else:
436
+ raise RuntimeError(f"Invalid operator {opname}")
437
+
438
+ assert isinstance(args[1], torch.fx.Node)
439
+ so = get_script_object(gm, args[1])
440
+
441
+ func_args = []
442
+ func_args += [args[0]]
443
+ func_args += so.unpack()[:2] # type: ignore[attr-defined]
444
+ if opname == "conv2d_clamp_run":
445
+ func_args += torch.ops.prepacked.unpack_prepacked_sizes_conv2d(so)[2:]
446
+
447
+ op_res_node = gm.graph.call_function(op_f, tuple(func_args))
448
+ return op_res_node
449
+
450
+
451
+ def _transform_batch_norm(gm: torch.fx.GraphModule, node: torch.fx.Node):
452
+ args = node.args
453
+ scale_node, zero_point_node = args[-2], args[-1]
454
+ op_res_node = gm.graph.call_function(
455
+ torch.ops.aten.native_batch_norm, (*args[:-3], False, 0.1, args[-3])
456
+ )
457
+ op_res_node = gm.graph.call_function(operator.getitem, (op_res_node, 0))
458
+ return op_res_node, scale_node, zero_point_node
459
+
460
+
461
+ def fx_transform_quantized_op_to_standard_op(
462
+ gm: torch.fx.GraphModule, node: torch.fx.Node
463
+ ) -> torch.fx.Node:
464
+ global _SCALE, _ZERO_POINT, _INPUT_Q_DTYPE
465
+
466
+ assert isinstance(node.target, torch._ops.OpOverload)
467
+ opname, overload = node.target._opname, node.target._overloadname
468
+
469
+ key = f"{opname}.{overload}"
470
+ opname_to_transform_f = {
471
+ "conv1d.new": _transform_conv_with_packedparam,
472
+ "conv1d_relu.new": _transform_conv_with_packedparam,
473
+ "conv1d.default": _transform_conv_with_packedparam,
474
+ "conv1d_relu.default": _transform_conv_with_packedparam,
475
+ "conv2d.new": _transform_conv_with_packedparam,
476
+ "conv2d_relu.new": _transform_conv_with_packedparam,
477
+ "conv2d.default": _transform_conv_with_packedparam,
478
+ "conv2d_relu.default": _transform_conv_with_packedparam,
479
+ "linear.default": _transform_linear_with_packedparam,
480
+ "linear_relu.default": _transform_linear_with_packedparam,
481
+ "add.default": _transform_op_where_last_two_arguments_are_scale_and_zero_point,
482
+ "add_relu.default": _transform_op_where_last_two_arguments_are_scale_and_zero_point,
483
+ "mul.default": _transform_op_where_last_two_arguments_are_scale_and_zero_point,
484
+ "mul_relu.default": _transform_op_where_last_two_arguments_are_scale_and_zero_point,
485
+ "softmax.default": _transform_op_where_last_two_arguments_are_scale_and_zero_point,
486
+ "cat.default": _transform_op_where_last_two_arguments_are_scale_and_zero_point,
487
+ "hardswish.default": _transform_op_where_last_two_arguments_are_scale_and_zero_point,
488
+ "batch_norm2d.default": _transform_batch_norm,
489
+ "mul.Scalar": _transform_scalar_arithmetic,
490
+ "add.Scalar": _transform_scalar_arithmetic,
491
+ }
492
+
493
+ if f"{key}" not in opname_to_transform_f:
494
+ raise RuntimeError(f"Unsupported quantized op during transformation: {key}")
495
+
496
+ op_res_node, scale_node, zero_point_node = opname_to_transform_f[f"{key}"](gm, node)
497
+
498
+ # Add fused activation layer.
499
+ op_res_node = insert_fused_activation_node(gm, opname, op_res_node)
500
+ _SCALE, _ZERO_POINT = scale_node, zero_point_node
501
+
502
+ assert _INPUT_Q_DTYPE is not None
503
+ qmin_node, qmax_node = insert_qmin_qmax_node(gm, _INPUT_Q_DTYPE)
504
+ q_fx_node = insert_quantized_node(
505
+ gm,
506
+ op_res_node,
507
+ scale_node,
508
+ zero_point_node,
509
+ qmin_node,
510
+ qmax_node,
511
+ _INPUT_Q_DTYPE,
512
+ torch.per_tensor_affine,
513
+ )
514
+ dq_fx_node = insert_dequantized_node(
515
+ gm,
516
+ q_fx_node,
517
+ scale_node,
518
+ zero_point_node,
519
+ qmin_node,
520
+ qmax_node,
521
+ _INPUT_Q_DTYPE,
522
+ None,
523
+ torch.per_tensor_affine,
524
+ )
525
+ return dq_fx_node
526
+
527
+
528
+ def replace_quantized_ops_with_standard_ops(gm: torch.fx.GraphModule):
529
+ """
530
+ Replace legacy quantized ops (aten.quantize_per_tensor, quantized.conv) with
531
+ PT2 ops (quantize_decomposed.quantize_per_tensor, aten.conv).
532
+
533
+ Before: x || -> aten.q || -> quantized.conv2d || -> quantized.linear || -> aten.dq || -> y
534
+
535
+ After: x || -> qd.q -> qd.dq || -> aten.conv2d -> qd.q -> qd.dq || aten.linear -> qd.q -> qd.dq || -> y
536
+
537
+ (qd == quantized_decomposed library, q = quantize, dq = dequantize)
538
+ ^
539
+ |
540
+ getattr(w), getattr(b) from Conv2dParamPrepack
541
+
542
+ During each iteration, the transformation spits out the transformed operator, its quantized output,
543
+ and its dequantized value together. We did this because dequantization need to use the
544
+ scale and zero point parameters from the quantization to recover the approximate original value. After each
545
+ iteration, the new dequantization node will be used as the input to the next node (e.g., dq2 -> linear).
546
+
547
+ For operators like conv2d and linear, their weights and bias are packed in a quantized format in the ScriptObject.
548
+ During the transformation, we unpack those objects, get their dequantized tensor, populate those
549
+ as attributes to the module, and use getattr to access them.
550
+
551
+ One exception in the transformation is conv_prepack and linear_prepack. Those calls pack
552
+ weight and bias constant tensors into ScriptObject, which are then used by subsequent conv2d or linear calls.
553
+ During transformation, we directly skip transforming conv_prepack or linear_prepack. We check whether ScriptObject to the
554
+ quantized::conv2d or linear is from conv_prepack or linear_prepack. If it is, we then inline those parameters
555
+ to the operator by converting them to a getattr fx.node.
556
+
557
+ For prepacked::conv2d_clamp_run and prepacked::linear_clamp_run, we directly convert them to aten.conv2d and aten.linear
558
+ without the need of doing de/quantization.
559
+
560
+ Three global variables defined are _INPUT_Q_DTYPE, _SCALE, _ZERO_POINT. _INPUT_Q_DTYPE determines the de/quantization
561
+ data type, which is the same across the entire program, but it only shows up in the very first quantization
562
+ call. _SCALE and _ZERO_POINT are used only when operators do not have those specified. E.g., mul.Scalar.
563
+ """
564
+
565
+ global _INPUT_Q_DTYPE
566
+
567
+ quantized = False
568
+
569
+ last_quantized_node = None
570
+ for node in gm.graph.nodes:
571
+ if isinstance(node.target, OpOverload):
572
+ with gm.graph.inserting_before(node):
573
+ namespace, opname = node.target.namespace, node.target._opname
574
+ if namespace == "quantized" and opname not in [
575
+ "conv_prepack",
576
+ "linear_prepack",
577
+ ]:
578
+ quantized = True
579
+ fx_node = fx_transform_quantized_op_to_standard_op(gm, node)
580
+ node.replace_all_uses_with(fx_node)
581
+ last_quantized_node = fx_node
582
+ elif namespace == "prepacked":
583
+ quantized = True
584
+ fx_node = _transform_prepacked_op(gm, node)
585
+ node.replace_all_uses_with(fx_node)
586
+ last_quantized_node = fx_node
587
+ elif namespace == "aten" and opname == "quantize_per_tensor":
588
+ inp_node, scale_node, zero_point_node, dtype_node = node.args
589
+ dtype_node = fx_enum_to_dtype(gm, dtype_node)
590
+ _INPUT_Q_DTYPE = dtype_node
591
+ qmin_node, qmax_node = insert_qmin_qmax_node(gm, dtype_node)
592
+ q_fx_node = insert_quantized_node(
593
+ gm,
594
+ inp_node,
595
+ scale_node,
596
+ zero_point_node,
597
+ qmin_node,
598
+ qmax_node,
599
+ dtype_node,
600
+ torch.per_tensor_affine,
601
+ )
602
+ dq_fx_node = insert_dequantized_node(
603
+ gm,
604
+ q_fx_node,
605
+ scale_node,
606
+ zero_point_node,
607
+ qmin_node,
608
+ qmax_node,
609
+ dtype_node,
610
+ None,
611
+ torch.per_tensor_affine,
612
+ )
613
+ node.replace_all_uses_with(dq_fx_node)
614
+ last_quantized_node = dq_fx_node
615
+ elif namespace == "aten" and opname == "dequantize":
616
+ assert last_quantized_node is not None
617
+ node.replace_all_uses_with(last_quantized_node)
618
+ else:
619
+ last_quantized_node = node
620
+
621
+ # Post-processing again to remove legacy ScriptObjects and quantizated tensors
622
+ # stored as attributes or in the buffer. This is used to clean up the GraphModule
623
+ # to not trigger tracing errors like missing __obj_flatten__ functions.
624
+ def _clean_attr(mod: torch.nn.Module):
625
+ for submod in mod.modules():
626
+ attr_names_to_clean = set()
627
+ for k, v in submod.__dict__.items():
628
+ if isinstance(v, torch.ScriptObject):
629
+ attr_names_to_clean.add(k)
630
+ if k == "_buffers":
631
+ buffer_name_to_clean = set()
632
+ for b_name, b_value in v.items():
633
+ if isinstance(b_value, torch.Tensor) and b_value.dtype in [
634
+ torch.qint8,
635
+ torch.quint8,
636
+ ]:
637
+ buffer_name_to_clean.add(b_name)
638
+ for b_name in buffer_name_to_clean:
639
+ v.pop(b_name, None)
640
+ for attr_name in attr_names_to_clean:
641
+ delattr(submod, attr_name)
642
+
643
+ if quantized:
644
+ """
645
+ TODO: SetAttr + quantized ops will result incorrect program. This flag is used to temporarily
646
+ bypass test cases.
647
+
648
+ The deadcode elimination pass is needed to remove legacy quantized ops. Otherwise, retracing
649
+ will throw errors. However, the current way of SetAttr does inplace update to attributes, so
650
+ this pass regard them as dead code and remove them. Below is an example of GraphModule before
651
+ and after the dead code elimination pass.
652
+
653
+ class GraphModule(torch.nn.Module):
654
+ def forward(self, x_1):
655
+ # No stacktrace found for following nodes
656
+ data = self.data; data = None
657
+ data_1 = self.data
658
+ add_tensor = torch.ops.aten.add.Tensor(data_1, x_1, alpha = 1); data_1 = None
659
+ data_2 = self.data
660
+ copy_ = torch_Tensor_copy_(data_2, add_tensor); data_2 = add_tensor = copy_ = None
661
+ data_3 = self.data
662
+ add_tensor_1 = torch.ops.aten.add.Tensor(x_1, data_3, alpha = 1); x_1 = data_3 = None
663
+ return add_tensor_1
664
+
665
+ class GraphModule(torch.nn.Module):
666
+ def forward(self, x_1):
667
+ # No stacktrace found for following nodes
668
+ data_3 = self.data
669
+ add_tensor_1 = torch.ops.aten.add.Tensor(x_1, data_3, alpha = 1); x_1 = data_3 = None
670
+ return add_tensor_1
671
+ """
672
+ gm.graph.eliminate_dead_code()
673
+ _clean_attr(gm)
.venv/lib/python3.11/site-packages/torch/_export/passes/replace_set_grad_with_hop_pass.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+
3
+ import torch
4
+ from torch._higher_order_ops.wrap import wrap_with_set_grad_enabled
5
+
6
+ from ..utils import node_inline_, nodes_filter, nodes_first, nodes_map, sequential_split
7
+ from .replace_with_hop_pass_util import (
8
+ _replace_with_hop_helper,
9
+ _replace_with_hop_pass_helper,
10
+ _sequential_split_and_maybe_inline_subgraphs_helper,
11
+ )
12
+
13
+
14
+ def _is_set_grad_enabled_node(node: torch.fx.Node):
15
+ return (
16
+ node
17
+ and node.op == "call_function"
18
+ and node.target == torch._C._set_grad_enabled
19
+ )
20
+
21
+
22
+ def _is_set_grad_enabled_sub_mod(node: torch.fx.Node, omit_if_same_with_ambient=False):
23
+ if node.op == "call_module":
24
+ assert isinstance(node.target, str)
25
+ subgm = getattr(node.graph.owning_module, node.target)
26
+ first_non_ph = nodes_first(
27
+ subgm.graph.nodes, lambda node: node.op != "placeholder"
28
+ )
29
+ if (
30
+ first_non_ph
31
+ and first_non_ph.op == "call_function"
32
+ and first_non_ph.target == torch._C._set_grad_enabled
33
+ ):
34
+ return (
35
+ first_non_ph.args[0] != torch.is_grad_enabled()
36
+ if omit_if_same_with_ambient
37
+ else True
38
+ )
39
+ return False
40
+
41
+
42
+ def _replace_with_hop(node: torch.fx.Node):
43
+ assert node.op == "call_module"
44
+ graph: torch.fx.Graph = node.graph
45
+ gm: torch.fx.GraphModule = graph.owning_module
46
+ assert isinstance(node.target, str)
47
+ sub_gm = getattr(gm, node.target)
48
+ sub_graph = sub_gm.graph
49
+ set_grad_nodes = nodes_filter(sub_graph.nodes, _is_set_grad_enabled_node)
50
+ if len(set_grad_nodes) > 0:
51
+ assert len(set_grad_nodes) == 1
52
+ set_grad_node = set_grad_nodes[0]
53
+ _replace_with_hop_helper(
54
+ node, set_grad_node, _is_set_grad_enabled_node, wrap_with_set_grad_enabled
55
+ )
56
+ sub_graph.erase_node(set_grad_node)
57
+
58
+
59
+ def _remove_set_grad_and_inline(node: torch.fx.Node):
60
+ assert node.op == "call_module"
61
+ graph: torch.fx.Graph = node.graph
62
+ gm: torch.fx.GraphModule = graph.owning_module
63
+ assert isinstance(node.target, str)
64
+ sub_gm = getattr(gm, node.target)
65
+ sub_graph = sub_gm.graph
66
+ nodes_map(
67
+ sub_graph.nodes,
68
+ lambda n: sub_graph.erase_node(n) if _is_set_grad_enabled_node(n) else n,
69
+ )
70
+ node_inline_(node)
71
+
72
+
73
+ def _sequential_split_and_maybe_inline_subgraphs(
74
+ gm: torch.fx.GraphModule, graph_signature
75
+ ):
76
+ """
77
+ Helper function for replace_set_grad_with_hop_pass().
78
+ Split the graph module into multiple subgraphs based on the set_grad_enabled nodes.
79
+ For each subgraph, decides whether to construct a HOO subgraph, or inline the calls
80
+ back into the parent graph module.
81
+ """
82
+ need_replacing = any(_is_set_grad_enabled_node(node) for node in gm.graph.nodes)
83
+ if not need_replacing:
84
+ return gm, graph_signature
85
+
86
+ # sequential_split returns a new graph module that could have different output
87
+ # args names. We need to fix the graph signature.
88
+ new_gm = sequential_split(gm, _is_set_grad_enabled_node)
89
+
90
+ def _maybe_inline_or_replace_with_hop(node: torch.fx.Node):
91
+ if _is_set_grad_enabled_sub_mod(node, omit_if_same_with_ambient=True):
92
+ _replace_with_hop(node)
93
+ else:
94
+ _remove_set_grad_and_inline(node)
95
+
96
+ return _sequential_split_and_maybe_inline_subgraphs_helper(
97
+ new_gm, graph_signature, _maybe_inline_or_replace_with_hop
98
+ )
99
+
100
+
101
+ def replace_set_grad_with_hop_pass(gm: torch.fx.GraphModule, graph_signature):
102
+ """
103
+ Split gm into sub-graph-modules using `sequential_split_and_maybe_inline_subgraphs`, and
104
+ then recursively call itself on each of the submodules.
105
+ """
106
+ return _replace_with_hop_pass_helper(
107
+ gm,
108
+ graph_signature,
109
+ _sequential_split_and_maybe_inline_subgraphs,
110
+ )
.venv/lib/python3.11/site-packages/torch/_export/passes/replace_view_ops_with_view_copy_ops_pass.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+ from typing import Dict, Optional
3
+ import torch
4
+ from torch._ops import OpOverload, HigherOrderOperator
5
+ from torch._export.error import InternalError
6
+ from torch._export.pass_base import _ExportPassBaseDeprecatedDoNotUse
7
+
8
+
9
+ __all__ = ["ReplaceViewOpsWithViewCopyOpsPass"]
10
+
11
+
12
+ _NON_FUNCTIONAL_OPS_TO_FUNCTIONAL_OPS: Dict[OpOverload, OpOverload] = {
13
+ torch.ops.aten._unsafe_view.default: torch.ops.aten.view_copy.default,
14
+ }
15
+
16
+
17
+ def is_view_op(schema: torch._C.FunctionSchema) -> bool:
18
+ if len(schema.arguments) == 0:
19
+ return False
20
+ alias_info = schema.arguments[0].alias_info
21
+ return (alias_info is not None) and (not alias_info.is_write)
22
+
23
+
24
+ def get_view_copy_of_view_op(schema: torch._C.FunctionSchema) -> Optional[OpOverload]:
25
+ if is_view_op(schema) and schema.name.startswith("aten::"):
26
+ view_op_name = schema.name.split("::")[1]
27
+ view_op_overload = (
28
+ schema.overload_name
29
+ if schema.overload_name != ""
30
+ else "default"
31
+ )
32
+ view_copy_op_name = view_op_name + "_copy"
33
+ if not hasattr(torch.ops.aten, view_copy_op_name):
34
+ raise InternalError(f"{schema.name} is missing a view_copy variant")
35
+
36
+ view_copy_op_overload_packet = getattr(torch.ops.aten, view_copy_op_name)
37
+
38
+ if not hasattr(view_copy_op_overload_packet, view_op_overload):
39
+ raise InternalError(f"{schema.name} is missing a view_copy variant")
40
+
41
+ return getattr(view_copy_op_overload_packet, view_op_overload)
42
+
43
+ return None
44
+
45
+
46
+ class ReplaceViewOpsWithViewCopyOpsPass(_ExportPassBaseDeprecatedDoNotUse):
47
+ """
48
+ Our backend expects pure functional operators. For efficiency
49
+ purposes, we keep view ops around while functionalizing the exported
50
+ program. This pass replaces view ops with view copy ops for backends that
51
+ need AOT memory planning.
52
+ """
53
+ def call_operator(self, op, args, kwargs, meta):
54
+ if op in _NON_FUNCTIONAL_OPS_TO_FUNCTIONAL_OPS:
55
+ return super().call_operator(
56
+ (_NON_FUNCTIONAL_OPS_TO_FUNCTIONAL_OPS[op]), args, kwargs, meta
57
+ )
58
+
59
+ if isinstance(op, HigherOrderOperator):
60
+ return super().call_operator(op, args, kwargs, meta)
61
+
62
+ if view_copy_op := get_view_copy_of_view_op(op._schema):
63
+ return super().call_operator(view_copy_op, args, kwargs, meta)
64
+
65
+ return super().call_operator(op, args, kwargs, meta)
.venv/lib/python3.11/site-packages/torch/_export/passes/replace_with_hop_pass_util.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # mypy: allow-untyped-defs
2
+
3
+ import contextlib
4
+ import copy
5
+ import operator
6
+ from typing import Callable
7
+
8
+ import torch
9
+ from torch._ops import HigherOrderOperator
10
+
11
+ from ..utils import node_replace_, nodes_map
12
+
13
+
14
def _replace_with_hop_helper(
    node: torch.fx.Node,
    enter_block_node: torch.fx.Node,
    node_filter: Callable,
    wrap_hoo: HigherOrderOperator,
):
    """Replace a ``call_module`` node with a call to the higher-order op ``wrap_hoo``.

    ``node`` must be a call_module node whose target names a child GraphModule of
    the owning module. The child module is passed to ``wrap_hoo`` via a fresh
    get_attr node, together with ``enter_block_node.args`` and ``node``'s own
    args. Metadata (nn_module_stack, torch_fn, val) is copied from
    ``enter_block_node`` / the subgraph outputs onto the new nodes so downstream
    passes and the verifier still see consistent meta.

    NOTE(review): ``node_filter`` is accepted but never used in this body —
    confirm whether callers rely on it or it can be dropped.
    """
    graph: torch.fx.Graph = node.graph
    gm: torch.fx.GraphModule = graph.owning_module
    assert isinstance(node.target, str)
    sub_gm = getattr(gm, node.target)

    # Closure over `output_args` (assigned below, before any call): stamps the
    # new HOO call node with module-stack/torch_fn meta and a "val" tuple that
    # mirrors the subgraph's output values.
    def set_hoo_node_meta(call_func_node):
        call_func_node.meta["nn_module_stack"] = copy.copy(
            enter_block_node.meta.get("nn_module_stack", {})
        )
        call_func_node.meta["torch_fn"] = (
            f"{wrap_hoo.__name__}",
            f"{wrap_hoo.__class__.__name__}.{wrap_hoo.__name__}",
        )
        if isinstance(output_args, (tuple, list)):
            call_func_node.meta["val"] = tuple(arg.meta["val"] for arg in output_args)
        elif isinstance(output_args, torch.fx.Node):
            call_func_node.meta["val"] = (output_args.meta["val"],)

    with graph.inserting_before(node):
        get_attr_node = graph.get_attr(node.target)
        get_attr_node.meta["nn_module_stack"] = copy.copy(
            enter_block_node.meta.get("nn_module_stack", {})
        )
        # The output node, if present, is the last node of the subgraph.
        output_node = next(iter(reversed(sub_gm.graph.nodes)), None)
        # Split_module pass intentionally doesn't add an output node
        # if the graph doesn't return anything.
        # TODO (tmanlaibaatar) Figure out if this is right behaviour
        # for split_module
        if isinstance(output_node, torch.fx.Node) and output_node.op != "output":
            output_node = None
        if output_node is not None:
            assert len(output_node.args) == 1
            output_args = output_node.args[0]
            enter_block_node_args = enter_block_node.args
            if isinstance(output_args, (tuple, list)):
                # Multi-output subgraph: the HOO call returns a tuple, and the
                # existing getitem users of `node` are retargeted below.
                call_func_node = graph.call_function(
                    wrap_hoo,
                    (*enter_block_node_args, get_attr_node, *node.args),
                    {},
                )
                # Create the metadata
                set_hoo_node_meta(call_func_node)
                node_replace_(node, call_func_node)

                # Rename the name of getitem nodes to the actual name of its contents
                # for passing verifier and better readability, also propagate metadata
                for get_item_node in call_func_node.users.keys():
                    idx: int = get_item_node.args[1]  # type: ignore[assignment]
                    output_node = output_args[idx]
                    get_item_node._rename(output_node.name)
                    get_item_node.meta = output_node.meta

            elif isinstance(output_args, torch.fx.Node):
                # Single-output subgraph: normalize to a 1-tuple output and
                # insert a getitem so callers see the same value as before.
                call_func_node = graph.create_node(
                    "call_function",
                    wrap_hoo,
                    (*enter_block_node_args, get_attr_node, *node.args),
                    {},
                    output_args.name,
                )
                # Modify the subgraph to output a singleton list.
                output_node.args = ((output_args,),)
                # Add in an extra `getitem(wrap_hoo, 0)` node to the toplevel graph.
                get_item_node = graph.create_node(
                    "call_function",
                    operator.getitem,
                    (call_func_node, 0),
                    {},
                )
                # Create the metadata
                get_item_node.meta = output_args.meta
                set_hoo_node_meta(call_func_node)
                node_replace_(node, get_item_node)
            else:
                # NOTE(review): message below has typos ("repalce", "doesnt'")
                # — left as-is here; fixing it changes a runtime string.
                raise NotImplementedError(
                    f"repalce_with_hop_pass doesnt' support output type {type(output_args)}"
                )
        else:
            # Subgraph returns nothing: drop the call_module node entirely.
            # TODO (shangdiy): remove this line, since the export graph can be non-functional
            node.graph.erase_node(node)
100
+
101
+
102
def _sequential_split_and_maybe_inline_subgraphs_helper(
    new_gm: torch.fx.GraphModule,
    graph_signature,
    maybe_inline_or_replace_with_hop: Callable[[torch.fx.Node], None],
):
    """
    Helper function for replacing graph nodes with higher order nodes.
    For each subgraph in `new_gm`, decides whether to construct a HOO subgraph, or inline the calls
    back into the parent graph module, depending on `maybe_inline_or_replace_with_hop`.

    Returns the (mutated) `new_gm` and a shallow-copied signature whose output
    spec names were re-synced with the graph's output node, or None when no
    `graph_signature` was given.
    """
    # new_gm is a new graph module that could have different output args names.
    # We need to fix the graph signature.
    replace_ctx = contextlib.nullcontext()
    new_signature = None
    if graph_signature is not None:
        # Cannot deep copy a real ScriptObject, which is referenced
        # in the FakeScriptObject. Copy should be good enough to guard
        # against accidental mutation to original graph_signature.
        new_signature = copy.copy(graph_signature)
        # Last output node of the graph; asserted to match the spec count 1:1.
        new_gm_out_node = next(reversed(new_gm.graph.find_nodes(op="output")))
        assert new_gm_out_node.op == "output" and len(new_gm_out_node.args[0]) == len(
            new_signature.output_specs
        )
        for arg_node, out_spec in zip(
            new_gm_out_node.args[0], new_signature.output_specs
        ):
            if arg_node is None:
                assert out_spec.arg.value is None
            elif (
                isinstance(arg_node, torch.fx.Node)
                and out_spec.arg.name != arg_node.name
            ):
                # Re-sync spec names with the (possibly renamed) output nodes.
                out_spec.arg.name = arg_node.name

        # Keep the signature updated while nodes are replaced below.
        replace_ctx = new_gm._set_replace_hook(new_signature.get_replace_hook())  # type: ignore[assignment]

    with replace_ctx:
        # Only call_module nodes are candidates for inlining / HOO replacement;
        # all other nodes pass through unchanged.
        nodes_map(
            list(new_gm.graph.nodes),
            lambda node: (
                maybe_inline_or_replace_with_hop(node)
                if node.op == "call_module"
                else node
            ),
        )
    new_gm.recompile()
    return new_gm, new_signature
149
+
150
+
151
+ def _replace_with_hop_pass_helper(
152
+ gm: torch.fx.GraphModule,
153
+ graph_signature,
154
+ sequential_split_and_maybe_inline_subgraphs: Callable,
155
+ ):
156
+ """
157
+ Split gm into sub-graph-modules using `sequential_split_and_maybe_inline_subgraphs`, and
158
+ then recursively call itself on each of the submodules.
159
+ """
160
+ new_gm, new_signature = sequential_split_and_maybe_inline_subgraphs(
161
+ gm, graph_signature
162
+ )
163
+ # recursively call
164
+ for node in new_gm.graph.nodes:
165
+ if node.op == "get_attr":
166
+ subgm = getattr(new_gm, node.target)
167
+ if not isinstance(subgm, torch.fx.GraphModule):
168
+ continue
169
+ new_subgm, _ = _replace_with_hop_pass_helper(
170
+ subgm,
171
+ None,
172
+ sequential_split_and_maybe_inline_subgraphs,
173
+ )
174
+ setattr(new_gm, node.target, new_subgm)
175
+
176
+ new_gm.recompile()
177
+ new_gm.graph.lint()
178
+ return new_gm, new_signature